Merge branch 'develop' into small_matrices
This commit is contained in:
commit
255b6dd0fa
|
@ -0,0 +1,103 @@
|
|||
name: continuous build
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
fortran: [gfortran, flang]
|
||||
build: [cmake, make]
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
key: ${{ runner.os }}-ccache-${{ github.sha }}
|
||||
# Restore any ccache cache entry, if none for
|
||||
# ${{ runner.os }}-ccache-${{ github.sha }} exists
|
||||
restore-keys: |
|
||||
${{ runner.os }}-ccache-
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
cat /proc/cpuinfo
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
sysctl -a | grep machdep.cpu
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get install -y gfortran cmake ccache
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
brew install coreutils cmake ccache
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB
|
||||
|
||||
- name: gfortran build
|
||||
if: matrix.build == 'make' && matrix.fortran == 'gfortran'
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
export PATH="/usr/lib/ccache:${PATH}"
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0
|
||||
|
||||
- name: flang build
|
||||
if: matrix.build == 'make' && matrix.fortran == 'flang'
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
export PATH="/usr/lib/ccache:${PATH}"
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
exit 0
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd /usr/
|
||||
sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz
|
||||
sudo tar xf flang-20190329-x86-70.tgz
|
||||
sudo rm flang-20190329-x86-70.tgz
|
||||
cd -
|
||||
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang
|
||||
|
||||
|
||||
- name: CMake gfortran build
|
||||
if: matrix.build == 'cmake' && matrix.fortran == 'gfortran'
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
export PATH="/usr/lib/ccache:${PATH}"
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j$(nproc)
|
|
@ -21,6 +21,7 @@ jobs:
|
|||
build-OpenBLAS-with-Homebrew:
|
||||
runs-on: macos-latest
|
||||
env:
|
||||
DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer
|
||||
HOMEBREW_DEVELOPER: "ON"
|
||||
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
|
||||
HOMEBREW_NO_ANALYTICS: "ON"
|
||||
|
|
|
@ -70,6 +70,7 @@ test/SBLAT2.SUMM
|
|||
test/SBLAT3.SUMM
|
||||
test/ZBLAT2.SUMM
|
||||
test/ZBLAT3.SUMM
|
||||
test/SHBLAT3.SUMM
|
||||
test/cblat1
|
||||
test/cblat2
|
||||
test/cblat3
|
||||
|
@ -79,6 +80,7 @@ test/dblat3
|
|||
test/sblat1
|
||||
test/sblat2
|
||||
test/sblat3
|
||||
test/test_shgemm
|
||||
test/zblat1
|
||||
test/zblat2
|
||||
test/zblat3
|
||||
|
|
20
.travis.yml
20
.travis.yml
|
@ -16,7 +16,6 @@ matrix:
|
|||
before_script: &common-before
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
|
@ -76,6 +75,23 @@ matrix:
|
|||
- TARGET_BOX=LINUX32
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- os: linux
|
||||
arch: ppc64le
|
||||
dist: bionic
|
||||
compiler: gcc
|
||||
before_script:
|
||||
- sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y
|
||||
- sudo apt-get update
|
||||
- sudo apt-get install gcc-9 gfortran-9 -y
|
||||
script:
|
||||
- make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX_P9
|
||||
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
addons:
|
||||
|
@ -108,7 +124,6 @@ matrix:
|
|||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
before_script: *common-before
|
||||
script:
|
||||
- set -e
|
||||
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||
|
@ -151,7 +166,6 @@ matrix:
|
|||
before_script:
|
||||
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- mkdir build
|
||||
- CONFIG=Release
|
||||
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
||||
|
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 9.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 10.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
@ -23,6 +23,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun
|
|||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
|
@ -86,10 +87,13 @@ if (NOT NO_LAPACK)
|
|||
list(APPEND SUBDIRS lapack)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED BUILD_HALF)
|
||||
set (BUILD_HALF false)
|
||||
endif ()
|
||||
# set which float types we want to build for
|
||||
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
|
||||
# if none are defined, build for all
|
||||
set(BUILD_HALF true)
|
||||
# set(BUILD_HALF true)
|
||||
set(BUILD_SINGLE true)
|
||||
set(BUILD_DOUBLE true)
|
||||
set(BUILD_COMPLEX true)
|
||||
|
@ -121,7 +125,7 @@ if (BUILD_COMPLEX16)
|
|||
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
|
||||
endif ()
|
||||
|
||||
if (BUILD_SINGLE OR BUILD_HALF)
|
||||
if (BUILD_HALF)
|
||||
message(STATUS "Building Half Precision")
|
||||
list(APPEND FLOAT_TYPES "HALF") # defines nothing
|
||||
endif ()
|
||||
|
@ -229,6 +233,7 @@ if (NOT MSVC AND NOT NOFORTRAN)
|
|||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
endif()
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
endif()
|
||||
|
||||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
|
@ -244,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||
# Quote the expansion so the comparison still has a left-hand operand when both
# SYMBOLPREFIX and SYMBOLSUFFIX are empty or unset.
if (BUILD_SHARED_LIBS AND NOT "${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
|
@ -353,10 +358,21 @@ endif()
|
|||
|
||||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
# Quote the expansion: with an empty or unset SYMBOLPREFIX the unquoted form
# leaves STREQUAL without a left-hand operand.
if (NOT "${SYMBOLPREFIX}" STREQUAL "")
|
||||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
# Quote the expansion: with an empty or unset SYMBOLSUFFIX the unquoted form
# leaves STREQUAL without a left-hand operand.
if (NOT "${SYMBOLSUFFIX}" STREQUAL "")
|
||||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
@ -373,11 +389,9 @@ if(NOT NO_LAPACKE)
|
|||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
endif()
|
||||
|
||||
include(FindPkgConfig QUIET)
|
||||
if(PKG_CONFIG_FOUND)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
endif()
|
||||
# Install pkg-config files
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
|
||||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
|
||||
|
||||
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||
|
|
|
@ -180,3 +180,13 @@ In chronological order:
|
|||
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
||||
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
||||
|
||||
* Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR>
|
||||
* [2020-04-15] Half-precision GEMM for bfloat16
|
||||
|
||||
* Marius Hillenbrand <https://github.com/mhillenibm>
|
||||
* [2020-05-12] Revise dynamic architecture detection for IBM z
|
||||
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14
|
||||
|
||||
* Danfeng Zhang <https://github.com/craft-zhang>
|
||||
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
|
|
@ -1,4 +1,77 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.10
|
||||
14-Jun-2020
|
||||
|
||||
common:
|
||||
* Improved thread locking behaviour in blas_server and parallel getrf
|
||||
* Imported bugfix 394 from LAPACK (spurious reference to "XERBL"
|
||||
due to overlong lines)
|
||||
* Imported bugfix 403 from LAPACK (compile option "recursive" required
|
||||
for correctness with Intel and PGI)
|
||||
* Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB)
|
||||
* Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP)
|
||||
* Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that
|
||||
could lead to crashes at large matrix sizes
|
||||
* Restored internal soname in dynamic libraries on FreeBSD and Dragonfly
|
||||
* Added API (openblas_setaffinity) to set the thread affinity on Linux
|
||||
* Added initial infrastructure for half-precision floating point
|
||||
(bfloat16) support with a generic implementation of SHGEMM
|
||||
* Added CMAKE build system support for building the cblas_Xgemm3m
|
||||
functions
|
||||
* Fixed CMAKE support for building in a path with embedded spaces
|
||||
* Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC
|
||||
* Fixed GCC version detection in the Makefiles
|
||||
* Allowed overriding the names of AR, AS and LD in Makefile builds
|
||||
|
||||
POWER:
|
||||
* Fixed big-endian POWER8 ELFv2 builds on FreeBSD
|
||||
* Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9
|
||||
* Fixed CMAKE build support for POWER9
|
||||
* Fixed a potential race condition in the thread buffer allocation
|
||||
* Worked around LAPACK test failures on PPC G4
|
||||
|
||||
MIPS:
|
||||
* Fixed a potential race condition in the thread buffer allocation
|
||||
* Added support for MIPS 24K/24KE family based on P5600 kernels
|
||||
|
||||
MIPS64:
|
||||
* Fixed a potential race condition in the thread buffer allocation
|
||||
* Added TARGET=GENERIC
|
||||
|
||||
ARMV7:
|
||||
* Fixed a race condition in the thread buffer allocation
|
||||
|
||||
ARMV8:
|
||||
* Fixed a race condition in the thread buffer allocation
|
||||
* Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA
|
||||
* Improved performance of the ThunderX2 DAXPY kernel
|
||||
* Added an optimized SGEMM kernel for Cortex A53
|
||||
* Fixed Makefile support for INTERFACE64 (8-byte integer)
|
||||
|
||||
x86_64:
|
||||
* Fixed a syntax error in the CMAKE setup for SkylakeX
|
||||
* Improved performance of STRSM on Haswell, SkylakeX and Ryzen
|
||||
* Improved SGEMM performance for workloads with ldc a
|
||||
multiple of 1024
|
||||
* Improved DGEMM performance on Skylake X
|
||||
* Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH
|
||||
builds created on SkylakeX
|
||||
* Removed data alignment requirement in the SSE2 copy kernels
|
||||
that could cause spurious crashes
|
||||
* Added a workaround for an optimizer bug in AppleClang 11.0.3
|
||||
* Fixed LAPACK test failures due to wrong options for Intel Fortran
|
||||
* Fixed compilation and LAPACK test results with recent Flang
|
||||
and AMD AOCC
|
||||
* Fixed DYNAMIC_ARCH builds with CMAKE on OS X
|
||||
* Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max,
|
||||
cblas_?sum, cblas_?gemm3m in the shared library on OS X
|
||||
* Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes
|
||||
show the name of an older generation chip supported by the same kernels)
|
||||
|
||||
IBM Z:
|
||||
* Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14
|
||||
|
||||
====================================================================
|
||||
Version 0.3.9
|
||||
1-Mar-2020
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
node {
|
||||
stage('Checkout') {
|
||||
// A bare "checkout" is not a complete step invocation; hand it the SCM
// configuration of the job (assumes the pipeline script itself is loaded from
// SCM, which is what provides the "scm" variable).
checkout scm
|
||||
}
|
||||
|
||||
stage('Build') {
|
||||
sh("make")
|
||||
}
|
||||
}
|
8
Makefile
8
Makefile
|
@ -141,7 +141,7 @@ ifndef NO_FBLAS
|
|||
$(MAKE) -C test all
|
||||
endif
|
||||
$(MAKE) -C utest all
|
||||
ifndef NO_CBLAS
|
||||
ifneq ($(NO_CBLAS), 1)
|
||||
$(MAKE) -C ctest all
|
||||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||
$(MAKE) -C cpp_thread_test all
|
||||
|
@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
|||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
||||
endif
|
||||
ifndef NO_LAPACKE
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
|
||||
endif
|
||||
endif
|
||||
|
@ -264,6 +264,7 @@ lapack_prebuild :
|
|||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
@ -364,11 +365,12 @@ clean ::
|
|||
@$(MAKE) -C kernel clean
|
||||
#endif
|
||||
@$(MAKE) -C reference clean
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@rm -rf getarch.dSYM getarch_2nd.dSYM
|
||||
endif
|
||||
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||
@rm -f cblas.tmp cblas.tmp2
|
||||
@touch $(NETLIB_LAPACK_DIR)/make.inc
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
|
||||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
|
||||
|
|
|
@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
|||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
|
|
|
@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
|
|||
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
||||
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
|
||||
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
|
||||
PKG_EXTRALIB := $(EXTRALIB)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PKG_EXTRALIB += -lomp
|
||||
else
|
||||
PKG_EXTRALIB += -lgomp
|
||||
endif
|
||||
endif
|
||||
|
||||
.PHONY : install
|
||||
.NOTPARALLEL : install
|
||||
|
@ -45,7 +53,22 @@ install : lib.grd
|
|||
|
||||
ifndef NO_CBLAS
|
||||
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
@cp cblas.h cblas.tmp
|
||||
ifdef SYMBOLPREFIX
|
||||
@sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
|
||||
#change back any openblas_complex_float and double that got hit
|
||||
@sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
|
||||
endif
|
||||
ifdef SYMBOLSUFFIX
|
||||
@sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
|
||||
#change back any openblas_complex_float and double that got hit
|
||||
@sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
|
||||
endif
|
||||
@sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
endif
|
||||
|
||||
ifneq ($(OSNAME), AIX)
|
||||
|
@ -132,7 +155,7 @@ endif
|
|||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
|
||||
|
||||
|
@ -168,4 +191,3 @@ endif
|
|||
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo Install OK!
|
||||
|
||||
|
|
|
@ -9,23 +9,63 @@ else
|
|||
USE_OPENMP = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
|
||||
CCOMMON_OPT += -mcpu=power8 -mtune=power8
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
|
||||
CCOMMON_OPT += -mcpu=power9 -mtune=power9
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
|
||||
FCOMMON_OPT += -mcpu=power8 -mtune=power8
|
||||
else
|
||||
FCOMMON_OPT += -mcpu=power9 -mtune=power9
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(OSNAME), AIX)
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -DUSE_OPENMP -fopenmp
|
||||
else
|
||||
CCOMMON_OPT += -DUSE_OPENMP -mp
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
FCOMMON_OPT += -DUSE_OPENMP -fopenmp
|
||||
else
|
||||
FCOMMON_OPT += -DUSE_OPENMP -mp
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -68,6 +108,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64
|
|||
ifeq ($(COMPILER_F77), g77)
|
||||
FCOMMON_OPT += -mpowerpc64 -maix64
|
||||
endif
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -mpowerpc64 -maix64
|
||||
endif
|
||||
ifeq ($(COMPILER_F77), xlf)
|
||||
FCOMMON_OPT += -q64
|
||||
endif
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.9.dev
|
||||
VERSION = 0.3.10.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -273,6 +273,9 @@ COMMON_PROF = -pg
|
|||
#
|
||||
# CPP_THREAD_SAFETY_TEST = 1
|
||||
|
||||
|
||||
# If you want to enable the experimental BFLOAT16 support
|
||||
# BUILD_HALF = 1
|
||||
#
|
||||
# End of user configuration
|
||||
#
|
||||
|
|
135
Makefile.system
135
Makefile.system
|
@ -21,8 +21,14 @@ ifeq ($(ARCH), amd64)
|
|||
override ARCH=x86_64
|
||||
else ifeq ($(ARCH), powerpc64)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), i386)
|
||||
override ARCH=x86
|
||||
else ifeq ($(ARCH), armv6)
|
||||
override ARCH=arm
|
||||
else ifeq ($(ARCH), armv7)
|
||||
override ARCH=arm
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
else ifeq ($(ARCH), zarch)
|
||||
|
@ -86,6 +92,9 @@ endif
|
|||
ifeq ($(TARGET), SKYLAKEX)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), COOPERLAKE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
|
@ -107,6 +116,9 @@ endif
|
|||
ifeq ($(TARGET), ARMV8)
|
||||
GETARCH_FLAGS := -DFORCE_ARMV7
|
||||
endif
|
||||
ifeq ($(TARGET), POWER8)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -125,6 +137,9 @@ endif
|
|||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
|
@ -266,10 +281,10 @@ endif
|
|||
|
||||
ARFLAGS =
|
||||
CPP = $(COMPILER) -E
|
||||
AR = $(CROSS_SUFFIX)ar
|
||||
AS = $(CROSS_SUFFIX)as
|
||||
LD = $(CROSS_SUFFIX)ld
|
||||
RANLIB = $(CROSS_SUFFIX)ranlib
|
||||
# GNU make predefines AR, AS and LD (as "ar", "as" and "ld"), so a plain "?="
# never takes effect for them and the $(CROSS_SUFFIX) prefix would be lost when
# cross-compiling. Replace the value only while it is still make's built-in
# default (or undefined, e.g. under "make -R"); values coming from the command
# line, the environment or an earlier makefile assignment are left untouched.
ifneq ($(filter default undefined,$(origin AR)),)
AR = $(CROSS_SUFFIX)ar
endif
ifneq ($(filter default undefined,$(origin AS)),)
AS = $(CROSS_SUFFIX)as
endif
ifneq ($(filter default undefined,$(origin LD)),)
LD = $(CROSS_SUFFIX)ld
endif
# RANLIB has no built-in default in make, so "?=" behaves as intended here.
RANLIB ?= $(CROSS_SUFFIX)ranlib
|
||||
NM = $(CROSS_SUFFIX)nm
|
||||
DLLWRAP = $(CROSS_SUFFIX)dllwrap
|
||||
OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||
|
@ -282,6 +297,26 @@ NO_LAPACK = 1
|
|||
override FEXTRALIB =
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
# Note that the behavior of -dumpversion is compile-time-configurable for
|
||||
# gcc-7.x and newer. Use -dumpfullversion there
|
||||
ifeq ($(GCCVERSIONGTEQ7),1)
|
||||
GCCDUMPVERSION_PARAM := -dumpfullversion
|
||||
else
|
||||
GCCDUMPVERSION_PARAM := -dumpversion
|
||||
endif
|
||||
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||
endif
|
||||
|
||||
#
|
||||
# OS dependent settings
|
||||
#
|
||||
|
@ -328,13 +363,7 @@ ifeq ($(C_COMPILER), CLANG)
|
|||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
|
@ -348,7 +377,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
|
|||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Ensure the correct stack alignment on Win32
|
||||
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
|
||||
|
@ -540,7 +568,7 @@ DYNAMIC_CORE += HASWELL ZEN
|
|||
endif
|
||||
ifneq ($(NO_AVX512), 1)
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += SKYLAKEX
|
||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -565,11 +593,38 @@ DYNAMIC_CORE += THUNDERX
|
|||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
DYNAMIC_CORE += THUNDERX3T110
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
DYNAMIC_CORE = Z13
|
||||
DYNAMIC_CORE = ZARCH_GENERIC
|
||||
|
||||
# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
ZARCH_SUPPORT_Z13 := 1
|
||||
else ifeq ($(GCCVERSIONEQ5), 1)
|
||||
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
||||
ZARCH_SUPPORT_Z13 := 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
|
||||
# $(shell) runs under /bin/sh unless SHELL is overridden, so use the POSIX "."
# builtin instead of the bash-only "source" to read VERSION_ID.
ifeq ($(shell . /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1)
|
||||
ZARCH_SUPPORT_Z13 := 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ZARCH_SUPPORT_Z13), 1)
|
||||
DYNAMIC_CORE += Z13
|
||||
else
|
||||
$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
DYNAMIC_CORE += Z14
|
||||
else
|
||||
$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
|
@ -577,14 +632,23 @@ DYNAMIC_CORE = POWER6
|
|||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
DYNAMIC_CORE += POWER10
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
DYNAMIC_CORE += POWER9
|
||||
else
|
||||
# No comma after the function name: "$(info, ...)" is parsed as a reference to
# an (undefined) variable and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
DYNAMIC_CORE += POWER10
|
||||
else ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
||||
DYNAMIC_CORE += POWER10
|
||||
endif
|
||||
else
|
||||
# No comma after the function name: "$(info, ...)" is parsed as a reference to
# an (undefined) variable and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -745,8 +809,19 @@ endif
|
|||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER8)
|
||||
CCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
ifeq ($(CORE), POWER9)
|
||||
CCOMMON_OPT += -tp pwr9
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -tp p7
|
||||
endif
|
||||
endif
|
||||
|
@ -765,6 +840,15 @@ endif
|
|||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
|
||||
ifeq ($(FLANG_VENDOR),AOCC)
|
||||
FCOMMON_OPT += -fno-unroll-loops
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifdef BINARY64
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
|
@ -860,7 +944,7 @@ ifneq ($(INTERFACE64), 0)
|
|||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
FCOMMON_OPT += -recursive
|
||||
FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
@ -900,8 +984,19 @@ ifneq ($(INTERFACE64), 0)
|
|||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER8)
|
||||
FCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
ifeq ($(CORE), POWER9)
|
||||
FCOMMON_OPT += -tp pwr9
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -tp p7
|
||||
endif
|
||||
FCOMMON_OPT += -Mrecursive
|
||||
|
@ -1129,6 +1224,10 @@ ifeq ($(USE_TLS), 1)
|
|||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
CCOMMON_OPT += -DBUILD_HALF
|
||||
endif
|
||||
|
||||
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
||||
|
||||
ifndef SYMBOLPREFIX
|
||||
|
@ -1155,6 +1254,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
|||
|
||||
include $(TOPDIR)/Makefile.$(ARCH)
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
|
||||
endif
|
||||
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
||||
|
||||
ifeq ($(CORE), PPC440)
|
||||
|
@ -1247,7 +1349,6 @@ endif
|
|||
|
||||
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||
|
||||
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||
#MAKEOVERRIDES =
|
||||
|
@ -1354,6 +1455,7 @@ export ARCH
|
|||
export CORE
|
||||
export LIBCORE
|
||||
export __BYTE_ORDER__
|
||||
export ELF_VERSION
|
||||
export PGCPATH
|
||||
export CONFIG
|
||||
export CC
|
||||
|
@ -1399,6 +1501,7 @@ export KERNELDIR
|
|||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
export NO_AVX512
|
||||
export BUILD_HALF
|
||||
|
||||
export SHGEMM_UNROLL_M
|
||||
export SHGEMM_UNROLL_N
|
||||
|
|
|
@ -27,18 +27,54 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), COOPERLAKE)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX2
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||
CCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
CCOMMON_OPT += -march=z14 -mzvector -O3
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
|
|
@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
|||
## Installation from Source
|
||||
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
|
||||
sure to use the develop branch - master is several years out of date due to a change of maintainership.)
|
||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||
Most can also be given directly on the make or cmake command line.
|
||||
|
||||
|
@ -58,6 +59,10 @@ Examples:
|
|||
```sh
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
```
|
||||
or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI:
|
||||
```sh
|
||||
make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A
|
||||
```
|
||||
|
||||
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
|
||||
```sh
|
||||
|
|
|
@ -22,6 +22,7 @@ SANDYBRIDGE
|
|||
HASWELL
|
||||
SKYLAKEX
|
||||
ATOM
|
||||
COOPERLAKE
|
||||
|
||||
b)AMD CPU:
|
||||
ATHLON
|
||||
|
@ -49,6 +50,7 @@ POWER6
|
|||
POWER7
|
||||
POWER8
|
||||
POWER9
|
||||
POWER10
|
||||
PPCG4
|
||||
PPC970
|
||||
PPC970MP
|
||||
|
@ -95,6 +97,7 @@ FALKOR
|
|||
THUNDERX
|
||||
THUNDERX2T99
|
||||
TSV110
|
||||
THUNDERX3T110
|
||||
|
||||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
|
|
|
@ -49,6 +49,12 @@ else
|
|||
GOTO_LAPACK_TARGETS=
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
GOTO_HALF_TARGETS=shgemm.goto
|
||||
else
|
||||
GOTO_HALF_TARGETS=
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
|
@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto
|
||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS)
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
|||
samin.goto damin.goto camin.goto zamin.goto \
|
||||
smin.goto dmin.goto \
|
||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX)
|
|||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Sgemm ####################################################
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
endif
|
||||
|
||||
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
|
@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX)
|
|||
|
||||
##################################### Sgeev ####################################################
|
||||
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
sgeev.acml : sgeev.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX)
|
|||
|
||||
##################################### Dgeev ####################################################
|
||||
dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dgeev.acml : dgeev.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX)
|
|||
##################################### Cgeev ####################################################
|
||||
|
||||
cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
cgeev.acml : cgeev.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX)
|
|||
##################################### Zgeev ####################################################
|
||||
|
||||
zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
zgeev.acml : zgeev.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX)
|
|||
|
||||
##################################### Sgetri ####################################################
|
||||
sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
sgetri.acml : sgetri.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX)
|
|||
|
||||
##################################### Dgetri ####################################################
|
||||
dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
dgetri.acml : dgetri.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1913,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX)
|
|||
##################################### Cgetri ####################################################
|
||||
|
||||
cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
cgetri.acml : cgetri.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX)
|
|||
##################################### Zgetri ####################################################
|
||||
|
||||
zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
zgetri.acml : zgetri.$(SUFFIX)
|
||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c
|
|||
zcholesky.$(SUFFIX) : cholesky.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.$(SUFFIX) : gemm.c
|
||||
$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
endif
|
||||
|
||||
sgemm.$(SUFFIX) : gemm.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
|
|
|
@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifdef DOUBLE
|
||||
#define GEMM BLASFUNC(dgemm)
|
||||
#elif defined(HALF)
|
||||
#define GEMM BLASFUNC(shgemm)
|
||||
#else
|
||||
#define GEMM BLASFUNC(sgemm)
|
||||
#endif
|
||||
|
@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
IFLOAT *a, *b;
|
||||
FLOAT *c;
|
||||
FLOAT alpha[] = {1.0, 0.0};
|
||||
FLOAT beta [] = {0.0, 0.0};
|
||||
char transa = 'N';
|
||||
|
@ -184,10 +187,10 @@ int main(int argc, char *argv[]){
|
|||
k = to;
|
||||
}
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) {
|
||||
if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) {
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) {
|
||||
if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) {
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
|
||||
|
@ -199,10 +202,10 @@ int main(int argc, char *argv[]){
|
|||
#endif
|
||||
|
||||
for (i = 0; i < m * k * COMPSIZE; i++) {
|
||||
a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
for (i = 0; i < k * n * COMPSIZE; i++) {
|
||||
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
|
|
|
@ -170,9 +170,11 @@ int main(int argc, char *argv[]){
|
|||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
#ifdef RETURN_BY_STACK
|
||||
DOT (&result , &m, x, &inc_x, y, &inc_y );
|
||||
#else
|
||||
result = DOT (&m, x, &inc_x, y, &inc_y );
|
||||
|
||||
#endif
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
|
|
26
c_check
26
c_check
|
@ -6,6 +6,7 @@
|
|||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX");
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
|
@ -248,6 +249,28 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
|||
}
|
||||
}
|
||||
|
||||
$c11_atomics = 0;
|
||||
if ($data =~ /HAVE_C11/) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11";
|
||||
$c11_atomics = 0;
|
||||
} else {
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
# Write the probe source through the handle returned by tempfile(); $tmpf is
# only the file name. Close the handle so the buffered text is flushed to disk
# before the compiler is invoked on $tmpf.
print $fh "#include <stdatomic.h>\nint main(void){}\n";
close($fh);
|
||||
$args = " -c -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$c11_atomics = 0;
|
||||
} else {
|
||||
$c11_atomics = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
||||
$data =~ /globl\s([_\.]*)(.*)/;
|
||||
|
@ -310,6 +333,7 @@ $linker_a = "";
|
|||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /[0-9]+/)
|
||||
) {
|
||||
$linker_l .= $flags . " "
|
||||
}
|
||||
|
@ -350,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
|
|||
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
||||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
|
||||
|
|
|
@ -45,11 +45,11 @@ endif ()
|
|||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
|
||||
endif ()
|
||||
|
||||
if (X86)
|
||||
|
@ -76,9 +76,9 @@ if (DYNAMIC_ARCH)
|
|||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
|
|
|
@ -103,3 +103,16 @@ if (${CORE} STREQUAL "SKYLAKEX")
|
|||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL "COOPERLAKE")
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
# set() takes "<variable> <value>" with no "=" sign; a literal "=" would become the first element of the stored list.
set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
|
|
@ -21,7 +21,15 @@
|
|||
# NEED2UNDERSCORES
|
||||
|
||||
if (NOT NO_LAPACK)
|
||||
enable_language(Fortran)
|
||||
include(CheckLanguage)
|
||||
check_language(Fortran)
|
||||
if(CMAKE_Fortran_COMPILER)
|
||||
enable_language(Fortran)
|
||||
else()
|
||||
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
|
||||
set (NOFORTRAN 1)
|
||||
set (NO_LAPACK 1)
|
||||
endif()
|
||||
else()
|
||||
include(CMakeForceCompiler)
|
||||
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
|
||||
|
|
|
@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG")
|
|||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee")
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "G77")
|
||||
|
|
|
@ -113,6 +113,7 @@ macro(SetDefaultL1)
|
|||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
if (BUILD_HALF)
|
||||
set(SHAMINKERNEL ../arm/amin.c)
|
||||
set(SHAMAXKERNEL ../arm/amax.c)
|
||||
set(SHMAXKERNEL ../arm/max.c)
|
||||
|
@ -131,6 +132,7 @@ macro(SetDefaultL1)
|
|||
set(SHNRM2KERNEL ../arm/nrm2.c)
|
||||
set(SHSUMKERNEL ../arm/sum.c)
|
||||
set(SHSWAPKERNEL ../arm/swap.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL2)
|
||||
|
@ -179,10 +181,11 @@ macro(SetDefaultL2)
|
|||
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
if (BUILD_HALF)
|
||||
set(SHGEMVNKERNEL ../arm/gemv_n.c)
|
||||
set(SHGEMVTKERNEL ../arm/gemv_t.c)
|
||||
set(SHGERKERNEL ../generic/ger.c)
|
||||
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL3)
|
||||
|
@ -190,6 +193,7 @@ macro(SetDefaultL3)
|
|||
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
if (BUILD_HALF)
|
||||
set(SHGEADD_KERNEL ../generic/geadd.c)
|
||||
set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
set(SHGEMM_BETA ../generic/gemm_beta.c)
|
||||
|
@ -201,6 +205,6 @@ macro(SetDefaultL3)
|
|||
set(SHGEMMITCOPYOBJ shgemm_itcopy.o)
|
||||
set(SHGEMMONCOPYOBJ shgemm_oncopy.o)
|
||||
set(SHGEMMOTCOPYOBJ shgemm_otcopy.o)
|
||||
|
||||
endif ()
|
||||
|
||||
endmacro ()
|
||||
|
|
|
@ -7,5 +7,5 @@ Name: OpenBLAS
|
|||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: -L${libdir} -lopenblas${libsuffix}
|
||||
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
|
||||
Cflags: -I${includedir}
|
||||
|
|
|
@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
|
|||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin")
|
||||
set(EXTRALIB "${EXTRALIB} -lm")
|
||||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
|
|
@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
if ("${TCORE}" STREQUAL "CORTEXA57")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
else ()
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
endif ()
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
|
@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "THUNDERX3T110")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define THUNDERX3T110\n"
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t8\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t8\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define L3_SIZE\t94371840\n"
|
||||
"#define L3_LINESIZE\t64\n"
|
||||
"#define L3_ASSOCIATIVE\t32\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
|
@ -420,7 +452,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER9")
|
||||
elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
|
@ -492,7 +524,7 @@ else(NOT CMAKE_CROSSCOMPILING)
|
|||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
@ -520,7 +552,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE
|
|||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
|
|
|
@ -33,7 +33,7 @@ endif ()
|
|||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
set(NO_AVX 1)
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
|
@ -45,6 +45,18 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
|||
endif ()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
|
@ -297,6 +309,16 @@ if (USE_SIMPLE_THREADED_LEVEL3)
|
|||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
if (DEFINED MAX_STACK_ALLOC)
|
||||
if (NOT ${MAX_STACK_ALLOC} EQUAL 0)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}")
|
||||
endif ()
|
||||
else ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED LIBNAMESUFFIX)
|
||||
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
||||
else ()
|
||||
|
@ -407,6 +429,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows
|
|||
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
|
||||
endif ()
|
||||
|
||||
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
|
||||
if ("${F_COMPILER}" STREQUAL "FLANG")
|
||||
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3)
|
||||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED SUFFIX)
|
||||
set(SUFFIX o)
|
||||
endif ()
|
||||
|
|
|
@ -116,3 +116,10 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
|||
endif()
|
||||
file(REMOVE "avx512.c" "avx512.o")
|
||||
endif()
|
||||
|
||||
include(CheckIncludeFile)
|
||||
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11)
|
||||
if (HAVE_C11 EQUAL 1)
|
||||
# Quote the text: separate unquoted arguments to message() are concatenated without spaces.
message (STATUS "found stdatomic.h")
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11")
|
||||
endif()
|
||||
|
|
|
@ -15,12 +15,36 @@ endfunction ()
|
|||
# Reads a Makefile into CMake vars.
|
||||
macro(ParseMakefileVars MAKEFILE_IN)
|
||||
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
file(STRINGS ${MAKEFILE_IN} makefile_contents)
|
||||
foreach (makefile_line ${makefile_contents})
|
||||
#message(STATUS "parsing ${makefile_line}")
|
||||
if (${IfElse} GREATER 0)
|
||||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "ENDIF ${makefile_line}")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
continue ()
|
||||
endif ()
|
||||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "ELSE ${makefile_line}")
|
||||
set (ElseSeen 1)
|
||||
continue ()
|
||||
endif()
|
||||
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
|
||||
# message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
set(var_name ${CMAKE_MATCH_1})
|
||||
set(var_value ${CMAKE_MATCH_2})
|
||||
# set(var_value ${CMAKE_MATCH_2})
|
||||
string(STRIP ${CMAKE_MATCH_2} var_value)
|
||||
# check for Makefile variables in the string, e.g. $(TSUFFIX)
|
||||
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
|
||||
foreach (make_var ${make_var_matches})
|
||||
|
@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
else ()
|
||||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on include ${line_match}")
|
||||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
||||
else ()
|
||||
# message(STATUS "unmatched line ${line_match}")
|
||||
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
# DEFINED expects the variable *name* (the captured Makefile variable), so dereference only once there;
# the STREQUAL operand still uses that variable's value.
if (DEFINED ${CMAKE_MATCH_1} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
else ()
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
|
9
common.h
9
common.h
|
@ -360,13 +360,8 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef POWER8
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef POWER9
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
|
@ -686,7 +681,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void);
|
|||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||
#endif
|
||||
|
||||
#if (__STDC_VERSION__ >= 201112L)
|
||||
#ifdef HAVE_C11
|
||||
#if defined(C_GCC) && ( __GNUC__ < 7)
|
||||
// workaround for GCC bug 65467
|
||||
#ifndef _Atomic
|
||||
|
|
|
@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
|
||||
void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K,
|
||||
float * A, BLASLONG strideA,
|
||||
float * B, BLASLONG strideB,
|
||||
float * R, BLASLONG strideR);
|
||||
|
||||
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
|
||||
|
||||
int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
|
|
|
@ -94,7 +94,7 @@ REALNAME:
|
|||
#endif
|
||||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
#define BUFFER_SIZE (16 << 21)
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
|
|
@ -227,7 +227,7 @@ REALNAME: ;\
|
|||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define BUFFER_SIZE ( 32 << 21)
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
|
|
|
@ -47,7 +47,7 @@ typedef struct {
|
|||
int dtb_entries;
|
||||
int offsetA, offsetB, align;
|
||||
|
||||
#if 1
|
||||
#ifdef BUILD_HALF
|
||||
int shgemm_p, shgemm_q, shgemm_r;
|
||||
int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn;
|
||||
|
||||
|
@ -175,6 +175,11 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
#ifdef ARCH_X86_64
|
||||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
|
||||
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
#endif
|
||||
|
||||
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
|
||||
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
|
@ -1002,12 +1007,14 @@ extern gotoblas_t *gotoblas;
|
|||
|
||||
#define HAVE_EX_L2 gotoblas -> exclusive_cache
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
#define SHGEMM_P gotoblas -> shgemm_p
|
||||
#define SHGEMM_Q gotoblas -> shgemm_q
|
||||
#define SHGEMM_R gotoblas -> shgemm_r
|
||||
#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m
|
||||
#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n
|
||||
#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn
|
||||
#endif
|
||||
|
||||
#define SGEMM_P gotoblas -> sgemm_p
|
||||
#define SGEMM_Q gotoblas -> sgemm_q
|
||||
|
@ -1086,6 +1093,7 @@ extern gotoblas_t *gotoblas;
|
|||
#define HAVE_EX_L2 0
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
#define SHGEMM_P SHGEMM_DEFAULT_P
|
||||
#define SHGEMM_Q SHGEMM_DEFAULT_Q
|
||||
#define SHGEMM_R SHGEMM_DEFAULT_R
|
||||
|
@ -1096,6 +1104,7 @@ extern gotoblas_t *gotoblas;
|
|||
#else
|
||||
#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define SGEMM_P SGEMM_DEFAULT_P
|
||||
#define SGEMM_Q SGEMM_DEFAULT_Q
|
||||
|
@ -1330,31 +1339,31 @@ extern gotoblas_t *gotoblas;
|
|||
#endif
|
||||
|
||||
#ifndef SHGEMM_DEFAULT_R
|
||||
#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15)
|
||||
#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef SGEMM_DEFAULT_R
|
||||
#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15)
|
||||
#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef DGEMM_DEFAULT_R
|
||||
#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15)
|
||||
#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef QGEMM_DEFAULT_R
|
||||
#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15)
|
||||
#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef CGEMM_DEFAULT_R
|
||||
#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15)
|
||||
#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef ZGEMM_DEFAULT_R
|
||||
#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15)
|
||||
#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef XGEMM_DEFAULT_R
|
||||
#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15)
|
||||
#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef SNUMOPT
|
||||
|
|
|
@ -68,7 +68,7 @@
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define RMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
|
@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){
|
|||
" bne- 1f\n"
|
||||
" stwcx. %2,0, %1\n"
|
||||
" bne- 0b\n"
|
||||
" isync\n"
|
||||
"1: "
|
||||
: "=&r"(ret)
|
||||
: "r"(address), "r" (val)
|
||||
|
@ -272,7 +273,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
@ -294,7 +295,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define L1_PREFETCH dcbtst
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#define L1_DUALFETCH
|
||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||
#define L1_PREFETCH dcbtst
|
||||
|
@ -843,7 +844,7 @@ Lmcount$lazy_ptr:
|
|||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8) || defined(POWER9)
|
||||
#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
|
|
12
common_s.h
12
common_s.h
|
@ -45,6 +45,10 @@
|
|||
#define SSYMV_THREAD_U ssymv_thread_U
|
||||
#define SSYMV_THREAD_L ssymv_thread_L
|
||||
|
||||
|
||||
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
|
||||
#define SGEMM_DIRECT sgemm_direct
|
||||
|
||||
#define SGEMM_ONCOPY sgemm_oncopy
|
||||
#define SGEMM_OTCOPY sgemm_otcopy
|
||||
|
||||
|
@ -214,6 +218,14 @@
|
|||
#define SSYMV_THREAD_U ssymv_thread_U
|
||||
#define SSYMV_THREAD_L ssymv_thread_L
|
||||
|
||||
#ifdef ARCH_X86_64
|
||||
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
|
||||
#define SGEMM_DIRECT gotoblas -> sgemm_direct
|
||||
#else
|
||||
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
|
||||
#define SGEMM_DIRECT sgemm_direct
|
||||
#endif
|
||||
|
||||
#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy
|
||||
#define SGEMM_OTCOPY gotoblas -> sgemm_otcopy
|
||||
#define SGEMM_INCOPY gotoblas -> sgemm_incopy
|
||||
|
|
|
@ -132,18 +132,18 @@ extern int blas_server_avail;
|
|||
static __inline int num_cpu_avail(int level) {
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
int openmp_nthreads=0;
|
||||
int openmp_nthreads=omp_get_max_threads();
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
if (blas_cpu_number == 1
|
||||
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
|| omp_in_parallel()
|
||||
if (openmp_nthreads == 1 || omp_in_parallel()
|
||||
#endif
|
||||
) return 1;
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
openmp_nthreads=omp_get_max_threads();
|
||||
if (blas_cpu_number != openmp_nthreads) {
|
||||
goto_set_num_threads(openmp_nthreads);
|
||||
}
|
||||
|
|
|
@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
#endif
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
while (*address) {YIELDING;}
|
||||
|
||||
#ifndef C_MSVC
|
||||
__asm__ __volatile__(
|
||||
|
@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){
|
|||
#else
|
||||
extern unsigned int blas_quick_divide_table[];
|
||||
|
||||
static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
|
||||
unsigned int result;
|
||||
volatile unsigned int result;
|
||||
|
||||
if (y <= 1) return x;
|
||||
|
||||
|
@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
||||
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
@ -229,14 +228,8 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
#ifndef BUFFERSIZE
|
||||
#if defined(SKYLAKEX)
|
||||
#define BUFFER_SIZE (32 << 21)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#define BUFFER_SIZE (32 << 22)
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << 20)
|
||||
#endif
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
||||
|
|
|
@ -5,6 +5,14 @@ inline void pauser(){
|
|||
std::getline(std::cin, dummy);
|
||||
}
|
||||
|
||||
// Abort the test program when the requested number of concurrent calls is zero.
//
// numConcurrentThreads: number of concurrent calls into OpenBLAS requested
//                       for the test run.
//
// When the value is 0 an error and a failure banner are printed to stdout and
// the process terminates via exit(-1); for any other value the function
// returns without side effects.
//
// Declared inline, like pauser() in this header, so that including the header
// from more than one translation unit cannot produce duplicate definitions.
inline void FailIfThreadsAreZero(uint32_t numConcurrentThreads) {
	if(numConcurrentThreads == 0) {
		std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl;
		std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
		exit(-1);
	}
}
|
||||
|
||||
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
||||
for(uint32_t i=0; i<numMat; i++){
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||
|
|
|
@ -47,6 +47,8 @@ int main(int argc, char* argv[]){
|
|||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
FailIfThreadsAreZero(numConcurrentThreads);
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
|
|
@ -18,7 +18,7 @@ int main(int argc, char* argv[]){
|
|||
uint32_t maxHwThreads = omp_get_max_threads();
|
||||
|
||||
if (maxHwThreads < 52)
|
||||
numConcurrentThreads = maxHwThreads -4;
|
||||
numConcurrentThreads = maxHwThreads;
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
|
@ -48,6 +48,8 @@ int main(int argc, char* argv[]){
|
|||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
FailIfThreadsAreZero(numConcurrentThreads);
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
|
15
cpuid.h
15
cpuid.h
|
@ -118,6 +118,7 @@
|
|||
#define CORE_ZEN 27
|
||||
#define CORE_SKYLAKEX 28
|
||||
#define CORE_DHYANA 29
|
||||
#define CORE_COOPERLAKE 30
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
|
@ -137,11 +138,12 @@
|
|||
#define HAVE_MISALIGNSSE (1 << 15)
|
||||
#define HAVE_128BITFPU (1 << 16)
|
||||
#define HAVE_FASTMOVU (1 << 17)
|
||||
#define HAVE_AVX (1 << 18)
|
||||
#define HAVE_FMA4 (1 << 19)
|
||||
#define HAVE_FMA3 (1 << 20)
|
||||
#define HAVE_AVX512VL (1 << 21)
|
||||
#define HAVE_AVX2 (1 << 22)
|
||||
#define HAVE_AVX (1 << 18)
|
||||
#define HAVE_FMA4 (1 << 19)
|
||||
#define HAVE_FMA3 (1 << 20)
|
||||
#define HAVE_AVX512VL (1 << 21)
|
||||
#define HAVE_AVX2 (1 << 22)
|
||||
#define HAVE_AVX512BF16 (1 << 23)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
|
@ -218,7 +220,8 @@ typedef struct {
|
|||
#define CPUTYPE_ZEN 51
|
||||
#define CPUTYPE_SKYLAKEX 52
|
||||
#define CPUTYPE_DHYANA 53
|
||||
#define CPUTYPE_COOPERLAKE 54
|
||||
|
||||
#define CPUTYPE_HYGON_UNKNOWN 54
|
||||
#define CPUTYPE_HYGON_UNKNOWN 99
|
||||
|
||||
#endif
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
// Cavium
|
||||
#define CPU_THUNDERX 7
|
||||
#define CPU_THUNDERX2T99 8
|
||||
#define CPU_THUNDERX3T110 12
|
||||
//Hisilicon
|
||||
#define CPU_TSV110 9
|
||||
// Ampere
|
||||
|
@ -57,7 +58,8 @@ static char *cpuname[] = {
|
|||
"THUNDERX2T99",
|
||||
"TSV110",
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1"
|
||||
"NEOVERSEN1",
|
||||
"THUNDERX3T110"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
@ -72,7 +74,8 @@ static char *cpuname_lower[] = {
|
|||
"thunderx2t99",
|
||||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1"
|
||||
"neoversen1",
|
||||
"thunderx3t110"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -158,6 +161,8 @@ int detect(void)
|
|||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||
return CPU_THUNDERX2T99;
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8"))
|
||||
return CPU_THUNDERX3T110;
|
||||
// HiSilicon
|
||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||
return CPU_TSV110;
|
||||
|
@ -372,7 +377,25 @@ void get_cpuconfig(void)
|
|||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX3T110:
|
||||
printf("#define THUNDERX3T110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 524288 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 94371840 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@
|
|||
|
||||
#include <sys/utsname.h>
|
||||
#ifdef _AIX
|
||||
#include <sys/systemcfg.h>
|
||||
#include <sys/vminfo.h>
|
||||
#endif
|
||||
#ifdef __APPLE__
|
||||
|
@ -57,6 +58,7 @@
|
|||
#define CPUTYPE_PPCG4 7
|
||||
#define CPUTYPE_POWER8 8
|
||||
#define CPUTYPE_POWER9 9
|
||||
#define CPUTYPE_POWER10 10
|
||||
|
||||
char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
@ -68,7 +70,8 @@ char *cpuname[] = {
|
|||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
"POWER9",
|
||||
"POWER10"
|
||||
};
|
||||
|
||||
char *lowercpuname[] = {
|
||||
|
@ -81,7 +84,8 @@ char *lowercpuname[] = {
|
|||
"cell",
|
||||
"ppcg4",
|
||||
"power8",
|
||||
"power9"
|
||||
"power9",
|
||||
"power10"
|
||||
};
|
||||
|
||||
char *corename[] = {
|
||||
|
@ -94,7 +98,8 @@ char *corename[] = {
|
|||
"CELL",
|
||||
"PPCG4",
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
"POWER9",
|
||||
"POWER10"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -125,6 +130,7 @@ int detect(void){
|
|||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||
if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
|
||||
|
@ -132,34 +138,19 @@ int detect(void){
|
|||
#endif
|
||||
|
||||
#ifdef _AIX
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
// Cast from int to unsigned to ensure comparisons work for all bits in
|
||||
// the bit mask, even the top bit
|
||||
unsigned implementation = (unsigned) _system_configuration.implementation;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = popen("prtconf|grep 'Processor Type'", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Pro", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pclose(infile);
|
||||
|
||||
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
|
||||
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
|
||||
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
|
||||
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||
return CPUTYPE_POWER5;
|
||||
if (implementation >= 0x40000u) return CPUTYPE_POWER10;
|
||||
else if (implementation & 0x20000) return CPUTYPE_POWER9;
|
||||
else if (implementation & 0x10000) return CPUTYPE_POWER8;
|
||||
else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7
|
||||
else if (implementation & 0x04000) return CPUTYPE_POWER6;
|
||||
else if (implementation & 0x02000) return CPUTYPE_POWER5;
|
||||
else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450
|
||||
else if (implementation & 0x00800) return CPUTYPE_POWER4;
|
||||
else return CPUTYPE_POWER3;
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
@ -179,6 +170,9 @@ int detect(void){
|
|||
int id;
|
||||
__asm __volatile("mfpvr %0" : "=r"(id));
|
||||
switch ( id >> 16 ) {
|
||||
case 0x80: // POWER10
|
||||
return CPUTYPE_POWER10;
|
||||
break;
|
||||
case 0x4e: // POWER9
|
||||
return CPUTYPE_POWER9;
|
||||
break;
|
||||
|
|
84
cpuid_x86.c
84
cpuid_x86.c
|
@ -249,6 +249,22 @@ int support_avx512(){
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
 * Report whether the AVX512_BF16 feature bit is set.
 *
 * Returns 1 when support_avx512() succeeds and CPUID leaf 7, sub-leaf 1
 * has EAX bit 5 set; returns 0 otherwise. Always returns 0 when the build
 * defines NO_AVX or NO_AVX512.
 */
int support_avx512_bf16(){
#if defined(NO_AVX) || defined(NO_AVX512)
  return 0;
#else
  int eax, ebx, ecx, edx;

  /* BF16 is only considered when AVX512 itself is reported as usable. */
  if (!support_avx512())
    return 0;

  cpuid_count(7, 1, &eax, &ebx, &ecx, &edx);

  /* CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 is supported. */
  return (eax & (1 << 5)) ? 1 : 0;
#endif
}
|
||||
|
||||
int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
|
@ -335,6 +351,7 @@ int get_cputype(int gettype){
|
|||
if (support_avx()) feature |= HAVE_AVX;
|
||||
if (support_avx2()) feature |= HAVE_AVX2;
|
||||
if (support_avx512()) feature |= HAVE_AVX512VL;
|
||||
if (support_avx512_bf16()) feature |= HAVE_AVX512BF16;
|
||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||
#endif
|
||||
|
||||
|
@ -1337,6 +1354,8 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
|
@ -1406,6 +1425,17 @@ int get_cpuname(void){
|
|||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1443,10 +1473,11 @@ int get_cpuname(void){
|
|||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 3:
|
||||
case 7:
|
||||
case 10:
|
||||
// case 7:
|
||||
// case 10:
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 5:
|
||||
case 7:
|
||||
return CPUTYPE_BOBCAT;
|
||||
case 6:
|
||||
switch (model) {
|
||||
|
@ -1496,6 +1527,8 @@ int get_cpuname(void){
|
|||
// AMD Ryzen
|
||||
case 8:
|
||||
// AMD Ryzen2
|
||||
default:
|
||||
// Matisse/Renoir and other recent Ryzen2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
|
@ -1505,6 +1538,16 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
case 10: // Zen3
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1653,7 +1696,8 @@ static char *cpuname[] = {
|
|||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX",
|
||||
"DHYANA"
|
||||
"DHYANA",
|
||||
"COOPERLAKE"
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
|
@ -1709,7 +1753,8 @@ static char *lowercpuname[] = {
|
|||
"excavator",
|
||||
"zen",
|
||||
"skylakex",
|
||||
"dhyana"
|
||||
"dhyana",
|
||||
"cooperlake"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
|
@ -1742,7 +1787,8 @@ static char *corename[] = {
|
|||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX",
|
||||
"DHYANA"
|
||||
"DHYANA",
|
||||
"COOPERLAKE"
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
|
@ -1775,7 +1821,8 @@ static char *corename_lower[] = {
|
|||
"excavator",
|
||||
"zen",
|
||||
"skylakex",
|
||||
"dhyana"
|
||||
"dhyana",
|
||||
"cooperlake"
|
||||
};
|
||||
|
||||
|
||||
|
@ -1955,6 +2002,19 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
case 5:
|
||||
switch (model) {
|
||||
case 6:
|
||||
|
@ -1970,7 +2030,9 @@ int get_coretype(void){
|
|||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
|
@ -2083,7 +2145,7 @@ int get_coretype(void){
|
|||
return CORE_PILEDRIVER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
case 5: // New EXCAVATOR
|
||||
case 5: // New EXCAVATOR
|
||||
if(support_avx())
|
||||
return CORE_EXCAVATOR;
|
||||
else
|
||||
|
@ -2111,12 +2173,14 @@ int get_coretype(void){
|
|||
}
|
||||
break;
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
} else if (exfamily == 8 || exfamily == 10) {
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// Ryzen 2
|
||||
default:
|
||||
// Matisse,Renoir Ryzen2 models
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
|
@ -2237,6 +2301,7 @@ void get_cpuconfig(void){
|
|||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
|
||||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
|
||||
if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n");
|
||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||
|
@ -2307,6 +2372,7 @@ void get_sse(void){
|
|||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
|
||||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
|
||||
if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n");
|
||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||
|
|
3
ctest.c
3
ctest.c
|
@ -153,3 +153,6 @@ ARCH_ARM
|
|||
ARCH_ARM64
|
||||
#endif
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
HAVE_C11
|
||||
#endif
|
||||
|
|
|
@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS)
|
|||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX)
|
||||
endif
|
||||
|
||||
SBLASOBJS += \
|
||||
sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \
|
||||
strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \
|
||||
|
@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(
|
|||
COMMONOBJS += syrk_thread.$(SUFFIX)
|
||||
|
||||
ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX)
|
||||
endif
|
||||
SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
|
||||
DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
|
||||
QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX)
|
||||
|
|
|
@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#else
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
@ -91,7 +91,7 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
#ifdef HAVE_C11
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
|
|
|
@ -67,7 +67,7 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
#ifdef HAVE_C11
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
|
|
|
@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
/* Split local region of B into parts */
|
||||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, js + div_n) - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
@ -135,7 +135,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -205,7 +205,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -300,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -370,7 +370,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||
min_jj = ls - js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||
min_jj = js - ls - min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
@ -47,8 +47,10 @@ endif
|
|||
endif
|
||||
|
||||
ifdef USE_CUDA
|
||||
ifeq ($(USE_CUDA), 1)
|
||||
COMMONOBJS += cuda_init.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef FUNCTION_PROFILE
|
||||
COMMONOBJS += profile.$(SUFFIX)
|
||||
|
|
|
@ -141,7 +141,7 @@ typedef struct {
|
|||
|
||||
} thread_status_t;
|
||||
|
||||
#if (__STDC_VERSION__ >= 201112L)
|
||||
#ifdef HAVE_C11
|
||||
#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
|
||||
#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
|
||||
#else
|
||||
|
@ -281,6 +281,8 @@ int get_node(void);
|
|||
static int increased_threads = 0;
|
||||
|
||||
#ifdef OS_LINUX
|
||||
extern int openblas_get_num_threads(void);
|
||||
|
||||
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
|
||||
const int active_threads = openblas_get_num_threads();
|
||||
|
||||
|
@ -602,7 +604,7 @@ int blas_thread_init(void){
|
|||
if(ret!=0){
|
||||
struct rlimit rlim;
|
||||
const char *msg = strerror(ret);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg);
|
||||
#ifdef RLIMIT_NPROC
|
||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||
|
|
|
@ -55,7 +55,7 @@
|
|||
int blas_server_avail = 0;
|
||||
|
||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
#ifdef HAVE_C11
|
||||
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#else
|
||||
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
|
@ -320,7 +320,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
|
||||
while(true) {
|
||||
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
#ifdef HAVE_C11
|
||||
_Bool inuse = false;
|
||||
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||
#else
|
||||
|
@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
break;
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(OMP_SCHED)
|
||||
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
|
@ -345,7 +345,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
#ifdef HAVE_C11
|
||||
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||
#else
|
||||
blas_buffer_inuse[buf_index] = false;
|
||||
|
|
|
@ -332,7 +332,7 @@ int support_avx512(){
|
|||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
if((ebx & (1u<<31)) != 0){
|
||||
xgetbv(0, &eax, &edx);
|
||||
if((eax & 0xe0) == 0xe0)
|
||||
ret=1; //OS supports AVX512VL
|
||||
|
@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
case 10:
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
|
@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){
|
|||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||
if ( (eax & 0xffff) >= 0x01) {
|
||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
|
||||
if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0)
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
|
@ -644,7 +656,7 @@ static gotoblas_t *get_coretype(void){
|
|||
if ((exfamily == 0) || (exfamily == 2)) {
|
||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
||||
else return &gotoblas_OPTERON;
|
||||
} else if (exfamily == 5) {
|
||||
} else if (exfamily == 5 || exfamily == 7) {
|
||||
return &gotoblas_BOBCAT;
|
||||
} else if (exfamily == 6) {
|
||||
if(model == 1){
|
||||
|
@ -698,7 +710,7 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1 || model == 8) {
|
||||
/* if (model == 1 || model == 8) */ {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
|
@ -712,10 +724,18 @@ static gotoblas_t *get_coretype(void){
|
|||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 10) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -764,18 +784,53 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_ATOM)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[ 6];
|
||||
#else
|
||||
return corename[10];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_PENRYN)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[ 8];
|
||||
#else
|
||||
return corename[7];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_DUNNINGTON)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[ 9];
|
||||
#else
|
||||
return corename[7];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
||||
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
||||
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
|
||||
if (gotoblas == &gotoblas_OPTERON) return corename[13];
|
||||
if (gotoblas == &gotoblas_OPTERON_SSE3)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[12];
|
||||
#else
|
||||
return corename[7];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_OPTERON)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[13];
|
||||
#else
|
||||
return corename[7];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
||||
if (gotoblas == &gotoblas_NANO)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[15];
|
||||
#else
|
||||
return corename[10];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||
if (gotoblas == &gotoblas_BOBCAT)
|
||||
#ifdef DYNAMIC_OLDER
|
||||
return corename[17];
|
||||
#else
|
||||
return corename[7];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
|
@ -787,6 +842,7 @@ char *gotoblas_corename(void) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
int i ;
|
||||
|
|
|
@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
|
|||
extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 11
|
||||
#define NUM_CORETYPES 12
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -82,6 +83,7 @@ static char *corename[] = {
|
|||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
|
@ -97,6 +99,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 8: return (&gotoblas_TSV110);
|
||||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_THUNDERX3T110);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_THUNDERX;
|
||||
case 0x0af: // ThunderX2
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
case 0x0b8: // ThunderX3
|
||||
return &gotoblas_THUNDERX3T110;
|
||||
}
|
||||
break;
|
||||
case 0x48: // HiSilicon
|
||||
|
|
|
@ -6,6 +6,13 @@ extern gotoblas_t gotoblas_POWER8;
|
|||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
|
||||
|| (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
#define HAVE_P10_SUPPORT 1
|
||||
#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
extern gotoblas_t gotoblas_POWER10;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
|
@ -13,7 +20,8 @@ static char *corename[] = {
|
|||
"unknown",
|
||||
"POWER6",
|
||||
"POWER8",
|
||||
"POWER9"
|
||||
"POWER9",
|
||||
"POWER10"
|
||||
};
|
||||
|
||||
/* Number of entries in corename[] above ("unknown", "POWER6", "POWER8",
 * "POWER9", "POWER10"). Must match the table size so that the last entry
 * (index 4, POWER10) is covered by any scan bounded by this count.
 * NOTE(review): presumably this is the loop bound used when matching a
 * user-supplied core name in force_coretype() -- confirm against that loop. */
#define NUM_CORETYPES 5
|
||||
|
@ -23,6 +31,9 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
if (gotoblas == &gotoblas_POWER10) return corename[4];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
@ -36,6 +47,10 @@ static gotoblas_t *get_coretype(void) {
|
|||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
|
||||
return &gotoblas_POWER10;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
@ -61,6 +76,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
|||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
case 4: return (&gotoblas_POWER10);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
|
|
|
@ -1,12 +1,58 @@
|
|||
|
||||
#include "common.h"
|
||||
#include <stdbool.h>
|
||||
|
||||
// Gate kernels for z13 and z14 on gcc version
|
||||
#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
|
||||
/* RHEL 7 since 7.3: */ \
|
||||
(__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
|
||||
__GNUC_RH_RELEASE__ >= 11)
|
||||
#define HAVE_Z13_SUPPORT
|
||||
#endif
|
||||
|
||||
#if __GNUC__ >= 7
|
||||
#define HAVE_Z14_SUPPORT
|
||||
#endif
|
||||
|
||||
// Guard the use of getauxval() on glibc version >= 2.16
|
||||
#ifdef __GLIBC__
|
||||
#include <features.h>
|
||||
#if __GLIBC_PREREQ(2, 16)
|
||||
#include <sys/auxv.h>
|
||||
#define HAVE_GETAUXVAL 1
|
||||
|
||||
static unsigned long get_hwcap(void)
|
||||
{
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
char *maskenv;
|
||||
|
||||
// honor requests for not using specific CPU features in LD_HWCAP_MASK
|
||||
maskenv = getenv("LD_HWCAP_MASK");
|
||||
if (maskenv)
|
||||
hwcap &= strtoul(maskenv, NULL, 0);
|
||||
|
||||
return hwcap;
|
||||
// note that a missing auxval is interpreted as no capabilities
|
||||
// available, which is safe.
|
||||
}
|
||||
|
||||
#else // __GLIBC_PREREQ(2, 16)
|
||||
// Build-time notice only: compilation continues and get_hwcap() is provided
// by the stub below, which reports no capabilities.
#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
|
||||
|
||||
static unsigned long get_hwcap(void) {
|
||||
// treat missing support for getauxval() as no capabilities available,
|
||||
// which is safe.
|
||||
return 0;
|
||||
}
|
||||
#endif // __GLIBC_PREREQ(2, 16)
|
||||
#endif // __GLIBC__
|
||||
|
||||
extern gotoblas_t gotoblas_ZARCH_GENERIC;
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
extern gotoblas_t gotoblas_Z13;
|
||||
#endif
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
extern gotoblas_t gotoblas_Z14;
|
||||
//extern gotoblas_t gotoblas_Z15;
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
//extern gotoblas_t gotoblas_Z14;
|
||||
//#endif
|
||||
#endif
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
|
@ -16,47 +62,50 @@ static char* corename[] = {
|
|||
"unknown",
|
||||
"Z13",
|
||||
"Z14",
|
||||
// "Z15",
|
||||
"ZARCH_GENERIC",
|
||||
};
|
||||
|
||||
char* gotoblas_corename(void) {
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||
#endif
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||
// if (gotoblas == &gotoblas_Z15) return corename[3];
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
//#endif
|
||||
return corename[0]; // try generic?
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
|
||||
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
// __builtin_cpu_is is not supported by zarch
|
||||
/**
|
||||
* Detect the fitting set of kernels by retrieving the CPU features supported by
|
||||
* OS from the auxiliary value AT_HWCAP and choosing the set of kernels
|
||||
* ("coretype") that exploits most of the features and can be compiled with the
|
||||
* available gcc version.
|
||||
* Note that we cannot use vector registers on a z13 or newer unless supported
|
||||
* by the OS kernel (which needs to handle them properly during context switch).
|
||||
*/
|
||||
static gotoblas_t* get_coretype(void) {
|
||||
FILE* infile;
|
||||
char buffer[512], * p;
|
||||
|
||||
p = (char*)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
||||
if (!strncmp("Type", buffer, 4)) {
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
unsigned long hwcap __attribute__((unused)) = get_hwcap();
|
||||
|
||||
// z14 and z15 systems: exploit Vector Facility (SIMD) and
|
||||
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
|
||||
return &gotoblas_Z14;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
// z13: Vector Facility (SIMD for double)
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
if (hwcap & HWCAP_S390_VX)
|
||||
return &gotoblas_Z13;
|
||||
#endif
|
||||
|
||||
if (strstr(p, "2964")) return &gotoblas_Z13;
|
||||
if (strstr(p, "2965")) return &gotoblas_Z13;
|
||||
if (strstr(p, "3906")) return &gotoblas_Z14;
|
||||
if (strstr(p, "3907")) return &gotoblas_Z14;
|
||||
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
|
||||
return NULL; // should be ZARCH_GENERIC
|
||||
// fallback in case of missing compiler support, systems before z13, or
|
||||
// when the OS does not advertise support for the Vector Facility (e.g.,
|
||||
// missing support in the OS kernel)
|
||||
return &gotoblas_ZARCH_GENERIC;
|
||||
}
|
||||
|
||||
static gotoblas_t* force_coretype(char* coretype) {
|
||||
|
@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) {
|
|||
|
||||
switch (found)
|
||||
{
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
case 1: return (&gotoblas_Z13);
|
||||
#endif
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
case 2: return (&gotoblas_Z14);
|
||||
// case 3: return (&gotoblas_Z15);
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// case 3: return (&gotoblas_POWER9);
|
||||
//#endif
|
||||
#endif
|
||||
case 3: return (&gotoblas_ZARCH_GENERIC);
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) {
|
|||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to Z14 core\n");
|
||||
snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_Z14;
|
||||
gotoblas = &gotoblas_ZARCH_GENERIC;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
|
|
|
@ -1095,7 +1095,7 @@ static BLASULONG base_address = 0UL;
|
|||
static BLASULONG base_address = BASE_ADDRESS;
|
||||
#endif
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
#ifdef HAVE_C11
|
||||
static _Atomic int memory_initialized = 0;
|
||||
#else
|
||||
static volatile int memory_initialized = 0;
|
||||
|
@ -2070,7 +2070,7 @@ if (!release->address) return;
|
|||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : munmap failed:");
|
||||
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
|
||||
printf("error code=%d,\trelease->address=%p\n",errsv,release->address);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -180,9 +180,10 @@ int get_L2_size(void){
|
|||
int eax, ebx, ecx, edx;
|
||||
|
||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
|
||||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
@ -266,7 +267,9 @@ int get_L2_size(void){
|
|||
void blas_set_parameter(void){
|
||||
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
|
||||
defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
|
||||
defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
|
|
@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED
|
|||
BUILD_LAPACK_DEPRECATED = 0
|
||||
endif
|
||||
|
||||
ifndef BUILD_HALF
|
||||
BUILD_HALF = 0
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifndef ONLY_CBLAS
|
||||
|
@ -51,6 +55,10 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
EXTRALIB += -pgf90libs
|
||||
endif
|
||||
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
FEXTRALIB =
|
||||
endif
|
||||
|
@ -151,8 +159,12 @@ ifeq ($(F_COMPILER), INTEL)
|
|||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else ifeq ($(F_COMPILER), FLANG)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
else
|
||||
|
||||
ifneq ($(C_COMPILER), LSB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||
|
@ -234,23 +246,23 @@ static : ../$(LIBNAME)
|
|||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin,
|
||||
izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax,
|
||||
scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger,
|
||||
shgemm, smax,smin,snrm2,
|
||||
smax,smin,snrm2,
|
||||
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
|
||||
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
|
||||
strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot,
|
||||
|
@ -40,17 +40,13 @@
|
|||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
||||
xerbla,
|
||||
saxpby,daxpby,caxpby,zaxpby,
|
||||
somatcopy, domatcopy, comatcopy, zomatcopy,
|
||||
simatcopy, dimatcopy, cimatcopy, zimatcopy,
|
||||
sgeadd,dgeadd,cgeadd,zgeadd,
|
||||
somatcopy,
|
||||
simatcopy,
|
||||
domatcopy,
|
||||
dimatcopy,
|
||||
comatcopy,
|
||||
cimatcopy,
|
||||
zomatcopy,
|
||||
zimatcopy,
|
||||
ssum, dsum, scsum, dzsum
|
||||
);
|
||||
|
||||
@halfblasobjs = (shgemm);
|
||||
@cblasobjs = (
|
||||
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
|
||||
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
|
||||
|
@ -67,7 +63,7 @@
|
|||
cblas_isamax, cblas_izamax,
|
||||
cblas_sasum, cblas_saxpy,
|
||||
cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm,
|
||||
cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg,
|
||||
cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg,
|
||||
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
|
||||
cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk,
|
||||
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
|
||||
|
@ -80,9 +76,16 @@
|
|||
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
|
||||
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
|
||||
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
|
||||
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd
|
||||
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd,
|
||||
cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin,
|
||||
cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin,
|
||||
cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax,
|
||||
cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum,
|
||||
cblas_xerbla
|
||||
);
|
||||
|
||||
@halfcblasobjs = (cblas_shgemm);
|
||||
|
||||
@exblasobjs = (
|
||||
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
||||
qgemv,qger,qmax,qmin,
|
||||
|
@ -3454,6 +3457,10 @@ use File::Spec;
|
|||
use File::Basename;
|
||||
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
|
||||
|
||||
if ($ARGV[12] == 1) {
|
||||
@blasobjs = (@blasobjs, @halfblasobjs);
|
||||
@cblasobjs = (@cblasobjs, @halfcblasobjs);
|
||||
}
|
||||
if ($ARGV[8] == 1) {
|
||||
#ONLY_CBLAS=1
|
||||
@underscore_objs = (@misc_underscore_objs);
|
||||
|
@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs);
|
|||
if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||
if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||
|
||||
|
||||
if ($ARGV[4] == 0) {
|
||||
@no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs);
|
||||
if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||
if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||
if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||
if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||
}else{
|
||||
#NO_CBLAS=1
|
||||
@no_underscore_objs = (@misc_no_underscore_objs);
|
||||
|
|
6
f_check
6
f_check
|
@ -82,6 +82,9 @@ if ($compiler eq "") {
|
|||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /pgf/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
} else {
|
||||
$vendor = G77;
|
||||
$openmp = "";
|
||||
|
@ -334,7 +337,8 @@ if ($link ne "") {
|
|||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/))
|
||||
&& ($flags !~ /[0-9]+/)
|
||||
&& ($flags !~ /^\-l$/)
|
||||
) {
|
||||
$linker_l .= $flags . " ";
|
||||
|
|
77
getarch.c
77
getarch.c
|
@ -90,11 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <sys/sysinfo.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(AIX)
|
||||
#include <sys/sysinfo.h>
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
#else
|
||||
#define NO_AVX512
|
||||
#endif
|
||||
#endif
|
||||
/* #define FORCE_P2 */
|
||||
/* #define FORCE_KATMAI */
|
||||
/* #define FORCE_COPPERMINE */
|
||||
|
@ -360,6 +365,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/* Forced target COOPERLAKE.
 * When NO_AVX512 is defined, the HASWELL names and flag string are used
 * instead (no AVX512 defines and no -march=cooperlake in ARCHCONFIG). */
#ifdef FORCE_COOPERLAKE
|
||||
#ifdef NO_AVX512
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
/* NO_AVX512 not defined: use the COOPERLAKE names, plus the AVX512VL /
 * AVX512BF16 defines and -march=cooperlake in the flag string. */
#else
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "COOPERLAKE"
|
||||
#define ARCHCONFIG "-DCOOPERLAKE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
|
||||
#define LIBNAME "cooperlake"
|
||||
#define CORENAME "COOPERLAKE"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
|
@ -650,6 +685,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "POWER9"
|
||||
#endif
|
||||
|
||||
/* Forced target POWER10: defines the architecture/sub-architecture names,
 * the "power" sub-directory name, and an ARCHCONFIG string of -D flags
 * giving L1/L2 sizes and line sizes, L2 associativity and DTB parameters. */
#if defined(FORCE_POWER10)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
#define SUBARCHITECTURE "POWER10"
|
||||
#define SUBDIRNAME "power"
|
||||
#define ARCHCONFIG "-DPOWER10 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "power10"
|
||||
#define CORENAME "POWER10"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_PPCG4
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "POWER"
|
||||
|
@ -1156,6 +1204,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "EMAG8180"
|
||||
#endif
|
||||
|
||||
/* Forced target THUNDERX3T110: also defines ARMV8, reports architecture
 * "ARM64" with sub-directory "arm64", and provides an ARCHCONFIG string of
 * -D flags for the L1 code/data, L2 and L3 cache parameters, the DTB
 * parameters and the VFP/NEON feature defines. */
#ifdef FORCE_THUNDERX3T110
|
||||
#define ARMV8
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "THUNDERX3T110"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTHUNDERX3T110 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx3t110"
|
||||
#define CORENAME "THUNDERX3T110"
|
||||
/* NOTE(review): the #else branch below is empty; it has no effect and
 * could be removed. */
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
|
@ -1284,6 +1350,11 @@ static int get_num_cores(void) {
|
|||
sysctl(m, 2, &count, &len, NULL, 0);
|
||||
|
||||
return count;
|
||||
|
||||
#elif defined(AIX)
|
||||
//returns the number of processors which are currently online
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
|
||||
#else
|
||||
return 2;
|
||||
#endif
|
||||
|
@ -1362,10 +1433,12 @@ int main(int argc, char *argv[]){
|
|||
|
||||
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
||||
#endif
|
||||
#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0
|
||||
#elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0
|
||||
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
||||
#endif
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
printf("ELF_VERSION=2\n");
|
||||
#endif
|
||||
|
||||
#ifdef MAKE_NB_JOBS
|
||||
#if MAKE_NB_JOBS > 0
|
||||
|
|
|
@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
|
||||
if (USE_GEMM3M)
|
||||
GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
endif()
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "COMPLEX")
|
||||
|
|
|
@ -46,7 +46,9 @@ SBLAS3OBJS = \
|
|||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
||||
sgeadd.$(SUFFIX)
|
||||
|
||||
# SHBLAS3OBJS (the shgemm object) is only set when BUILD_HALF is 1.
ifeq ($(BUILD_HALF),1)
|
||||
SHBLAS3OBJS = shgemm.$(SUFFIX)
|
||||
endif
|
||||
|
||||
DBLAS1OBJS = \
|
||||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
||||
|
@ -278,7 +280,9 @@ CSBLAS3OBJS = \
|
|||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
||||
cblas_sgeadd.$(SUFFIX)
|
||||
|
||||
# CSHBLAS3OBJS (the cblas_shgemm object) is only set when BUILD_HALF is 1.
ifeq ($(BUILD_HALF),1)
|
||||
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
|
||||
endif
|
||||
|
||||
CDBLAS1OBJS = \
|
||||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
|
@ -363,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX)
|
|||
endif
|
||||
|
||||
|
||||
ifndef NO_CBLAS
|
||||
ifneq ($(NO_CBLAS), 1)
|
||||
|
||||
override CFLAGS += -I.
|
||||
|
||||
|
@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c
|
|||
xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
# The shgemm objects are compiled from gemm.c (and depend on ../param.h);
# the rule only exists when BUILD_HALF is 1. The output is written to the
# file part of the target name ($(@F)).
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c
|
|||
cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
# The cblas_shgemm objects are compiled from gemm.c with -DCBLAS; the rule
# only exists when BUILD_HALF is 1.
ifeq ($(BUILD_HALF),1)
|
||||
cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
|
|
@ -324,8 +324,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
#ifdef DYNAMIC_ARCH
|
||||
if (support_avx512() )
|
||||
#endif
|
||||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) {
|
||||
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc);
|
||||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
|
||||
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
|
|
|
@ -127,17 +127,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) )
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9))
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
# Direct SGEMM objects: off by default, switched on for X86_64 only.
set(USE_DIRECT_SGEMM false)
|
||||
if (X86_64)
|
||||
set(USE_DIRECT_SGEMM true)
|
||||
endif()
|
||||
|
||||
# When enabled, both source names are set unconditionally (the
# NOT DEFINED guard below is commented out) and one SINGLE-precision
# object is generated from each of them.
if (USE_DIRECT_SGEMM)
|
||||
# if (NOT DEFINED SGEMMDIRECTKERNEL)
|
||||
set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c)
|
||||
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c)
|
||||
# endif()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
|
||||
endif()
|
||||
|
||||
foreach (float_type SINGLE DOUBLE HALF)
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "SH")
|
||||
if (NOT ${BUILD_HALF})
|
||||
continue ()
|
||||
else ()
|
||||
set (float_char "SH")
|
||||
endif ()
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
|
||||
endforeach()
|
||||
|
|
|
@ -8,8 +8,14 @@ include $(TOPDIR)/Makefile.system
|
|||
|
||||
# For GCC only: compare the first dot-separated field of
# `$(CC) -dumpversion` against 9 and 10 with expr, and store the result
# of each comparison. Both variables stay undefined for other compilers.
ifeq ($(C_COMPILER), GCC)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
endif
|
||||
|
||||
# On ARCH=power with clang, append -fno-integrated-as to CFLAGS
# (override is used so the flag is added even if CFLAGS came from the
# command line).
ifeq ($(ARCH), power)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
override CFLAGS += -fno-integrated-as
|
||||
endif
|
||||
endif
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
|
@ -32,7 +38,22 @@ ifdef NO_AVX2
|
|||
endif
|
||||
|
||||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
|
|
|
@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64)
|
|||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
||||
# USE_DIRECT_SGEMM is defined only for ARCH=x86_64.
ifeq ($(ARCH), x86_64)
|
||||
USE_DIRECT_SGEMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), ia64)
|
||||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
@ -39,18 +43,28 @@ ifeq ($(CORE), SKYLAKEX)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
# USE_TRMM selection for further targets: COOPERLAKE, ZEN, POWER8 (only
# when BINARY64 is 1), POWER9, POWER10 and ARCH=zarch each set USE_TRMM.
ifeq ($(CORE), COOPERLAKE)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
# POWER8 sets USE_TRMM only when BINARY64 is 1.
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(BINARY64),1)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
@ -59,7 +73,15 @@ ifeq ($(CORE), Z14)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
#ifndef SHGEMMKERNEL
|
||||
# Default sources for the direct SGEMM path, used when USE_DIRECT_SGEMM is
# defined. Each variable has its own ifndef guard so that a configuration
# which presets only one of them still gets a usable default for the other;
# with a single shared guard, presetting SGEMMDIRECTKERNEL alone left
# SGEMMDIRECTPERFORMANT empty.
ifdef USE_DIRECT_SGEMM
|
||||
ifndef SGEMMDIRECTKERNEL
|
||||
SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c
|
||||
endif
|
||||
ifndef SGEMMDIRECTPERFORMANT
|
||||
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
ifndef SHGEMMKERNEL
|
||||
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
|
@ -70,18 +92,25 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|||
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
#endif
|
||||
endif
|
||||
|
||||
SHKERNELOBJS += \
|
||||
shgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
|
||||
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
SKERNELOBJS += \
|
||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
||||
$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ)
|
||||
|
||||
# When USE_DIRECT_SGEMM is defined, the two direct SGEMM objects are
# appended to SKERNELOBJS.
ifdef USE_DIRECT_SGEMM
|
||||
SKERNELOBJS += \
|
||||
sgemm_direct$(TSUFFIX).$(SUFFIX) \
|
||||
sgemm_direct_performant$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DKERNELOBJS += \
|
||||
dgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \
|
||||
|
@ -110,7 +139,9 @@ XKERNELOBJS += \
|
|||
$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
|
||||
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
|
||||
|
||||
# SHKERNELOBJS are added to SHBLASOBJS only when BUILD_HALF is 1.
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += $(SHKERNELOBJS)
|
||||
endif
|
||||
SBLASOBJS += $(SKERNELOBJS)
|
||||
DBLASOBJS += $(DKERNELOBJS)
|
||||
QBLASOBJS += $(QKERNELOBJS)
|
||||
|
@ -118,7 +149,10 @@ CBLASOBJS += $(CKERNELOBJS)
|
|||
ZBLASOBJS += $(ZKERNELOBJS)
|
||||
XBLASOBJS += $(XKERNELOBJS)
|
||||
|
||||
# The shgemm_beta object is added to SHBLASOBJS only when BUILD_HALF is 1.
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SBLASOBJS += \
|
||||
sgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||
strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||
|
@ -461,11 +495,13 @@ ZBLASOBJS += \
|
|||
zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
endif
|
||||
|
||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
|
@ -491,8 +527,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
|||
XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
|
||||
# shgemm_beta is compiled from $(SHGEMM_BETA) with -DHALF and with DOUBLE
# and COMPLEX undefined; the rule only exists when BUILD_HALF is 1.
ifeq ($(BUILD_HALF),1)
|
||||
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
@ -512,12 +550,16 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
|||
$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
|
||||
$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s
|
||||
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s
|
||||
m4 shgemmotcopy.s > shgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@
|
||||
rm shgemmotcopy.s shgemmotcopy_nomacros.s
|
||||
|
@ -532,7 +574,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
|||
|
||||
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s
|
||||
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s
|
||||
m4 shgemmitcopy.s > shgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@
|
||||
rm shgemmitcopy.s shgemmitcopy_nomacros.s
|
||||
|
@ -540,6 +582,7 @@ else
|
|||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
|
@ -547,7 +590,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
|||
|
||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
|
||||
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||
|
@ -563,7 +606,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
|||
|
||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
|
||||
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||
|
@ -575,7 +618,7 @@ endif
|
|||
|
||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
|
||||
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||
|
@ -593,7 +636,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
|||
|
||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
|
||||
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||
|
@ -636,7 +679,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
|||
|
||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
|
||||
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||
|
@ -659,7 +702,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
|||
|
||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
|
||||
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||
|
@ -691,7 +734,7 @@ endif
|
|||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s
|
||||
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
|
@ -699,19 +742,29 @@ else
|
|||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
# Build rules for the two direct SGEMM objects, present only when
# USE_DIRECT_SGEMM is defined. Each object is compiled from the source
# named by SGEMMDIRECTPERFORMANT / SGEMMDIRECTKERNEL under $(KERNELDIR),
# with DOUBLE and COMPLEX undefined.
ifdef USE_DIRECT_SGEMM
|
||||
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
|
||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s
|
||||
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s
|
||||
m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
|
||||
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
|
@ -724,7 +777,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP
|
|||
|
||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
|
||||
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||
|
@ -734,7 +787,7 @@ endif
|
|||
|
||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
|
||||
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||
|
@ -744,7 +797,7 @@ endif
|
|||
|
||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
|
@ -754,7 +807,7 @@ endif
|
|||
|
||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
||||
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
|
||||
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||
|
@ -764,7 +817,7 @@ endif
|
|||
|
||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
|
||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||
|
@ -774,7 +827,7 @@ endif
|
|||
|
||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
|
||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||
|
@ -784,7 +837,7 @@ endif
|
|||
|
||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
|
||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||
|
@ -794,7 +847,7 @@ endif
|
|||
|
||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
||||
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
|
||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||
|
@ -818,7 +871,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
|||
ifdef USE_TRMM
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s
|
||||
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||
|
@ -828,7 +881,7 @@ endif
|
|||
|
||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s
|
||||
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||
|
@ -838,7 +891,7 @@ endif
|
|||
|
||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s
|
||||
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||
|
@ -848,7 +901,7 @@ endif
|
|||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
|
@ -858,7 +911,7 @@ endif
|
|||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
|
||||
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||
|
@ -868,7 +921,7 @@ endif
|
|||
|
||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
|
||||
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||
|
@ -878,7 +931,7 @@ endif
|
|||
|
||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
|
||||
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||
|
@ -888,7 +941,7 @@ endif
|
|||
|
||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
|
||||
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||
|
@ -910,7 +963,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
|||
|
||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s
|
||||
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||
|
@ -920,7 +973,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
|
||||
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||
|
@ -930,7 +983,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
|
||||
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||
|
@ -940,7 +993,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
|
||||
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||
|
@ -950,7 +1003,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
|
||||
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||
|
@ -960,7 +1013,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
|
||||
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||
|
@ -970,7 +1023,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
|
||||
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||
|
@ -980,7 +1033,7 @@ endif
|
|||
|
||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
|
||||
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||
|
@ -990,7 +1043,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
|
||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||
|
@ -1000,7 +1053,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
|
||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||
|
@ -1010,7 +1063,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
|
||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||
|
@ -1020,7 +1073,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
|
||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||
|
@ -1030,7 +1083,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
|
||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||
|
@ -1040,7 +1093,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
|
||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||
|
@ -1050,7 +1103,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
|
||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||
|
@ -1060,7 +1113,7 @@ endif
|
|||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
|
||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||
|
@ -1080,7 +1133,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
|||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
|
@ -1214,7 +1267,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
|
|||
|
||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
||||
$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
|
||||
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||
|
@ -2325,8 +2378,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_
|
|||
$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
@ -2343,6 +2398,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
|||
$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
|
@ -2357,6 +2414,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
|||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
|
@ -2373,7 +2432,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
|
|||
|
||||
endif
|
||||
|
||||
$(D<GEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||
|
@ -2461,8 +2520,11 @@ endif
|
|||
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
@ -2481,7 +2543,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
|||
|
||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
|
@ -2527,7 +2589,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
|||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||
# Run m4 on the file the preceding step just wrote (strmm_kernel_rt.s).
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
|
|
|
@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
dot[0]=0.0;
|
||||
dot[1]=0.0;
|
||||
|
||||
#if !defined(__PPC__)
|
||||
CREAL(result) = 0.0 ;
|
||||
CIMAG(result) = 0.0 ;
|
||||
|
||||
#else
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||
#endif
|
||||
if ( n < 1 ) return(result);
|
||||
|
||||
inc_x2 = 2 * inc_x ;
|
||||
|
@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
i++ ;
|
||||
|
||||
}
|
||||
CREAL(result) = dot[0];
|
||||
/* Use the same platform test as the zero-initialisation of result earlier in
   this function, so both places pick the same way of filling result. */
#if !defined(__PPC__)
|
||||
CREAL(result) = dot[0];
|
||||
CIMAG(result) = dot[1];
|
||||
#else
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]);
|
||||
#endif
|
||||
return(result);
|
||||
|
||||
}
|
||||
|
|
|
@ -1,3 +1,187 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV8
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
endif
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
@ -0,0 +1,184 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
# NRM2 kernel selection: single-precision real and complex share
# scnrm2_thunderx2t99.c, double-precision real and complex share
# dznrm2_thunderx2t99.c.
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
# The "_fast" double-precision variant is left disabled below; the regular
# dznrm2_thunderx2t99.c is what is actually assigned.
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
# GEMM kernel selection. Each *_thunderx2t99.S kernel is assigned only when
# the configured <UNROLL_M>x<UNROLL_N> pair equals the shape in its file name.
# NOTE(review): this file shows no fallback assignment for D/S/C/ZGEMMKERNEL
# when the unroll factors differ - confirm a default is provided elsewhere.
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
|
||||
endif
|
|
@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add X, X, #128
|
||||
.endm
|
||||
|
||||
/*
|
||||
* No need to do software prefetches if the vector fits
|
||||
* into L1 cache
|
||||
*/
|
||||
/*
 * KERNEL_F16_L1CACHE: one unrolled step over 128 bytes of X and 128 bytes
 * of Y (eight q registers of two doubles each, i.e. 16 doubles per vector).
 * Each Y pair is loaded, updated with fmla as Y += X * v0.d[0], and stored
 * back to the same offset; X and Y are then advanced by 128 bytes.
 * The macro issues no prefetch instructions.
 * NOTE(review): v0.d[0] is presumably the axpy scalar (alpha), set up by
 * code outside this macro - confirm.
 */
.macro KERNEL_F16_L1CACHE
|
||||
ldp q4, q5, [X]
|
||||
ldp q16, q17, [Y]
|
||||
|
||||
ldp q6, q7, [X, #32]
|
||||
ldp q18, q19, [Y, #32]
|
||||
|
||||
fmla v16.2d, v4.2d, v0.d[0]
|
||||
fmla v17.2d, v5.2d, v0.d[0]
|
||||
|
||||
stp q16, q17, [Y]
|
||||
|
||||
ldp q20, q21, [X, #64]
|
||||
ldp q24, q25, [Y, #64]
|
||||
|
||||
fmla v18.2d, v6.2d, v0.d[0]
|
||||
fmla v19.2d, v7.2d, v0.d[0]
|
||||
|
||||
stp q18, q19, [Y, #32]
|
||||
|
||||
ldp q22, q23, [X, #96]
|
||||
ldp q26, q27, [Y, #96]
|
||||
|
||||
fmla v24.2d, v20.2d, v0.d[0]
|
||||
fmla v25.2d, v21.2d, v0.d[0]
|
||||
|
||||
stp q24, q25, [Y, #64]
|
||||
|
||||
fmla v26.2d, v22.2d, v0.d[0]
|
||||
fmla v27.2d, v23.2d, v0.d[0]
|
||||
|
||||
stp q26, q27, [Y, #96]
|
||||
|
||||
add Y, Y, #128
|
||||
add X, X, #128
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F32: expands KERNEL_F16 twice in a row.
 */
.macro KERNEL_F32
|
||||
KERNEL_F16
|
||||
KERNEL_F16
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * KERNEL_F32_L1CACHE: expands KERNEL_F16_L1CACHE twice in a row, so one
 * expansion covers 2 * 128 bytes of X and of Y.
 */
.macro KERNEL_F32_L1CACHE
|
||||
KERNEL_F16_L1CACHE
|
||||
KERNEL_F16_L1CACHE
|
||||
.endm
|
||||
|
||||
.macro INIT_S
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
|
@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp I, xzr
|
||||
beq .Ldaxpy_kernel_F1
|
||||
|
||||
cmp N, #2048
|
||||
ble .Ldaxpy_kernel_F32_L1CACHE
|
||||
|
||||
.align 5
|
||||
.Ldaxpy_kernel_F32:
|
||||
|
||||
|
@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
subs I, I, #1
|
||||
bne .Ldaxpy_kernel_F32
|
||||
b .Ldaxpy_kernel_F1
|
||||
|
||||
.align 5
|
||||
.Ldaxpy_kernel_F32_L1CACHE:
|
||||
|
||||
KERNEL_F32_L1CACHE
|
||||
|
||||
subs I, I, #1
|
||||
bne .Ldaxpy_kernel_F32_L1CACHE
|
||||
|
||||
.Ldaxpy_kernel_F1:
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,562 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0
|
||||
#define N x1
|
||||
#define A00 x2
|
||||
#define LDA x3
|
||||
#define B00 x4
|
||||
|
||||
#define A01 x5
|
||||
#define A02 x6
|
||||
#define A03 x7
|
||||
#define A04 x8
|
||||
#define A05 x9
|
||||
#define A06 x10
|
||||
#define A07 x11
|
||||
#define A08 x12
|
||||
|
||||
#define I x13
|
||||
#define J x14
|
||||
#define K x15
|
||||
|
||||
#define TEMP1 x16
|
||||
#define TEMP2 x17
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
	// Reserve an 11 * 16 byte frame and spill d8-d17 and x18-x28 into it.
	sub	sp, sp, #(11 * 16)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]		// odd register out: stored alone in the last slot
.endm
|
||||
|
||||
.macro RESTORE_REGS
	// Reload everything SAVE_REGS spilled (slot layout is identical), then
	// release the 11 * 16 byte frame. The loads are independent of each other.
	ldr	x28, [sp, #(10 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d8, d9, [sp, #(0 * 16)]
	add	sp, sp, #(11 * 16)
.endm
|
||||
|
||||
.macro COPY4x8
	// Read four consecutive floats from each of the eight source pointers
	// A01..A08 (each pointer advances by 16 bytes).
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16
	ldr	q4, [A05], #16
	ldr	q5, [A06], #16
	ldr	q6, [A07], #16
	ldr	q7, [A08], #16

	// Interleave: element i of q0..q3 goes to v(8+2i), element i of q4..q7
	// goes to v(9+2i), so each output pair holds element i of all 8 sources.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v8.s[2], v2.s[0]
	mov	v8.s[3], v3.s[0]
	mov	v9.s[0], v4.s[0]
	mov	v9.s[1], v5.s[0]
	mov	v9.s[2], v6.s[0]
	mov	v9.s[3], v7.s[0]

	mov	v10.s[0], v0.s[1]
	mov	v10.s[1], v1.s[1]
	mov	v10.s[2], v2.s[1]
	mov	v10.s[3], v3.s[1]
	mov	v11.s[0], v4.s[1]
	mov	v11.s[1], v5.s[1]
	mov	v11.s[2], v6.s[1]
	mov	v11.s[3], v7.s[1]

	mov	v12.s[0], v0.s[2]
	mov	v12.s[1], v1.s[2]
	mov	v12.s[2], v2.s[2]
	mov	v12.s[3], v3.s[2]
	mov	v13.s[0], v4.s[2]
	mov	v13.s[1], v5.s[2]
	mov	v13.s[2], v6.s[2]
	mov	v13.s[3], v7.s[2]

	mov	v14.s[0], v0.s[3]
	mov	v14.s[1], v1.s[3]
	mov	v14.s[2], v2.s[3]
	mov	v14.s[3], v3.s[3]
	mov	v15.s[0], v4.s[3]
	mov	v15.s[1], v5.s[3]
	mov	v15.s[2], v6.s[3]
	mov	v15.s[3], v7.s[3]

	// Write the 32 packed floats; B00 advances by 128 bytes in total.
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64
.endm
|
||||
|
||||
.macro COPY2x8
	// Read two consecutive floats from each of the eight source pointers.
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8

	// v8:v9 collect element 0 of the eight sources, v10:v11 element 1.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v8.s[2], v2.s[0]
	mov	v8.s[3], v3.s[0]
	mov	v9.s[0], v4.s[0]
	mov	v9.s[1], v5.s[0]
	mov	v9.s[2], v6.s[0]
	mov	v9.s[3], v7.s[0]

	mov	v10.s[0], v0.s[1]
	mov	v10.s[1], v1.s[1]
	mov	v10.s[2], v2.s[1]
	mov	v10.s[3], v3.s[1]
	mov	v11.s[0], v4.s[1]
	mov	v11.s[1], v5.s[1]
	mov	v11.s[2], v6.s[1]
	mov	v11.s[3], v7.s[1]

	// Write the 16 packed floats; B00 advances by 64 bytes.
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
.endm
|
||||
|
||||
.macro COPY1x8
	// Read one float from each of the eight source pointers.
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4
	ldr	s4, [A05], #4
	ldr	s5, [A06], #4
	ldr	s6, [A07], #4
	ldr	s7, [A08], #4

	// Gather them into v8 (sources 1-4) and v9 (sources 5-8).
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v8.s[2], v2.s[0]
	mov	v8.s[3], v3.s[0]
	mov	v9.s[0], v4.s[0]
	mov	v9.s[1], v5.s[0]
	mov	v9.s[2], v6.s[0]
	mov	v9.s[3], v7.s[0]

	// Write the 8 packed floats; B00 advances by 32 bytes.
	st1	{v8.4s, v9.4s}, [B00], #32
.endm
|
||||
|
||||
.macro COPY4x4
	// Read four consecutive floats from each of the four source pointers.
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16

	// 4x4 transpose by lane moves: v(8+i) receives element i of q0..q3.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v8.s[2], v2.s[0]
	mov	v8.s[3], v3.s[0]

	mov	v9.s[0], v0.s[1]
	mov	v9.s[1], v1.s[1]
	mov	v9.s[2], v2.s[1]
	mov	v9.s[3], v3.s[1]

	mov	v10.s[0], v0.s[2]
	mov	v10.s[1], v1.s[2]
	mov	v10.s[2], v2.s[2]
	mov	v10.s[3], v3.s[2]

	mov	v11.s[0], v0.s[3]
	mov	v11.s[1], v1.s[3]
	mov	v11.s[2], v2.s[3]
	mov	v11.s[3], v3.s[3]

	// Write the 16 packed floats; B00 advances by 64 bytes.
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
.endm
|
||||
|
||||
.macro COPY2x4
	// Read two consecutive floats from each of the four source pointers.
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8

	// v8 receives element 0 of the four sources, v9 receives element 1.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v8.s[2], v2.s[0]
	mov	v8.s[3], v3.s[0]

	mov	v9.s[0], v0.s[1]
	mov	v9.s[1], v1.s[1]
	mov	v9.s[2], v2.s[1]
	mov	v9.s[3], v3.s[1]

	// Write the 8 packed floats; B00 advances by 32 bytes.
	st1	{v8.4s, v9.4s}, [B00], #32
.endm
|
||||
|
||||
.macro COPY1x4
	// Read one float from each of the four source pointers.
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4

	// Gather the four values into v8.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v8.s[2], v2.s[0]
	mov	v8.s[3], v3.s[0]

	// Write the 4 packed floats; B00 advances by 16 bytes.
	st1	{v8.4s}, [B00], #16
.endm
|
||||
|
||||
.macro COPY4x2
	// Read four consecutive floats from each of the two source pointers.
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16

	// Low half of v(8+i) = { element i of q0, element i of q1 }.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v9.s[0], v0.s[1]
	mov	v9.s[1], v1.s[1]
	mov	v10.s[0], v0.s[2]
	mov	v10.s[1], v1.s[2]
	mov	v11.s[0], v0.s[3]
	mov	v11.s[1], v1.s[3]

	// Only the low 64 bits of each register are stored (8 floats, 32 bytes).
	st1	{v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32
.endm
|
||||
|
||||
.macro COPY2x2
	// Read two consecutive floats from each of the two source pointers.
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8

	// 2x2 transpose into the low halves of v8 and v9.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]
	mov	v9.s[0], v0.s[1]
	mov	v9.s[1], v1.s[1]

	// Write the 4 packed floats; B00 advances by 16 bytes.
	st1	{v8.2s, v9.2s}, [B00], #16
.endm
|
||||
|
||||
.macro COPY1x2
	// Read one float from each of the two source pointers.
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4

	// Pair them in the low half of v8.
	mov	v8.s[0], v0.s[0]
	mov	v8.s[1], v1.s[0]

	// Write the 2 packed floats; B00 advances by 8 bytes.
	st1	{v8.2s}, [B00], #8
.endm
|
||||
|
||||
.macro COPY1x1
	// Move a single float from [A01] to [B00]; both pointers step by 4 bytes.
	ldr	s0, [A01], #4
	str	s0, [B00], #4
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE
|
||||
|
||||
.Lsgemm_ncopy_L8_BEGIN:
|
||||
|
||||
asr J, N, #3 // J = N / 8
|
||||
cmp J, #0
|
||||
ble .Lsgemm_ncopy_L4_BEGIN
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L8_M4_BEGIN:
|
||||
|
||||
mov A01, A00
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A05, A04, LDA
|
||||
add A06, A05, LDA
|
||||
add A07, A06, LDA
|
||||
add A08, A07, LDA
|
||||
add A00, A08, LDA
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L8_M4_40
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A01
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_1:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_1
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A02
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_2:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_2
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A03
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_3:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_3
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A04
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_4:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_4
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A05
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_5:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_5
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A06
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_6:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_6
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A07
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_7:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_7
|
||||
|
||||
asr K, M, #4 // K = M / 16(cacheline)
|
||||
mov TEMP1, A08
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_warnup_8:
|
||||
|
||||
ldr s0, [TEMP1], #64
|
||||
|
||||
subs K, K, #1
|
||||
bgt .Lsgemm_tcopy_L8_warnup_8
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L8_M4_20:
|
||||
|
||||
COPY4x8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsgemm_ncopy_L8_M4_20
|
||||
|
||||
.Lsgemm_ncopy_L8_M4_40:
|
||||
|
||||
and I, M, #2
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L8_M4_60
|
||||
|
||||
COPY2x8
|
||||
|
||||
.Lsgemm_ncopy_L8_M4_60:
|
||||
|
||||
and I, M, #1
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L8_M4_END
|
||||
|
||||
COPY1x8
|
||||
|
||||
.Lsgemm_ncopy_L8_M4_END:
|
||||
|
||||
subs J , J, #1 // j--
|
||||
bne .Lsgemm_ncopy_L8_M4_BEGIN
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_ncopy_L4_BEGIN:
|
||||
|
||||
tst N, #7
|
||||
ble .Lsgemm_ncopy_L999
|
||||
|
||||
tst N, #4
|
||||
ble .Lsgemm_ncopy_L2_BEGIN
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_BEGIN:
|
||||
mov A01, A00
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A00, A04, LDA
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L4_M4_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L4_M4_20:
|
||||
|
||||
COPY4x4
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsgemm_ncopy_L4_M4_20
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_40:
|
||||
|
||||
and I, M, #2
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L4_M4_60
|
||||
|
||||
COPY2x4
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_60:
|
||||
|
||||
and I, M, #1
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L4_M4_END
|
||||
|
||||
COPY1x4
|
||||
|
||||
.Lsgemm_ncopy_L4_M4_END:
|
||||
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_ncopy_L2_BEGIN:
|
||||
|
||||
tst N, #2
|
||||
ble .Lsgemm_ncopy_L1_BEGIN
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_BEGIN:
|
||||
|
||||
mov A01, A00
|
||||
add A02, A01, LDA
|
||||
add A00, A02, LDA
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L2_M4_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L2_M4_20:
|
||||
|
||||
COPY4x2
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_ncopy_L2_M4_20
|
||||
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_40:
|
||||
|
||||
and I, M, #2
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L2_M4_60
|
||||
|
||||
COPY2x2
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_60:
|
||||
|
||||
and I, M, #1
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L2_M4_END
|
||||
|
||||
COPY1x2
|
||||
|
||||
.Lsgemm_ncopy_L2_M4_END:
|
||||
|
||||
.Lsgemm_ncopy_L1_BEGIN:
|
||||
|
||||
tst N, #1
|
||||
ble .Lsgemm_ncopy_L999
|
||||
|
||||
.Lsgemm_ncopy_L1_M1_BEGIN:
|
||||
|
||||
mov A01, A00
|
||||
|
||||
mov I, M
|
||||
cmp I, #0
|
||||
ble .Lsgemm_ncopy_L1_M1_END
|
||||
|
||||
.align 5
|
||||
.Lsgemm_ncopy_L1_M1_20:
|
||||
|
||||
COPY1x1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsgemm_ncopy_L1_M1_20
|
||||
|
||||
.Lsgemm_ncopy_L1_M1_END:
|
||||
|
||||
.Lsgemm_ncopy_L999:
|
||||
|
||||
mov x0, #0
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,707 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0
|
||||
#define N x1
|
||||
#define A x2
|
||||
#define LDA x3
|
||||
#define B x4
|
||||
|
||||
#define M8 x5
|
||||
|
||||
#define A01 x6
|
||||
#define A02 x7
|
||||
#define A03 x8
|
||||
#define A04 x9
|
||||
#define A05 x10
|
||||
#define A06 x11
|
||||
#define A07 x12
|
||||
#define A08 x13
|
||||
|
||||
#define B01 x14
|
||||
#define B02 x15
|
||||
#define B03 x16
|
||||
#define B04 x17
|
||||
#define B00 x22
|
||||
|
||||
|
||||
// Inner loop counter. x18 is the AAPCS64 platform register and is reserved
// on some targets (e.g. Darwin, Windows), so a callee-saved register that
// this file's SAVE_REGS/RESTORE_REGS already preserve is used instead.
#define I x21
|
||||
#define J x19
|
||||
|
||||
#define TEMP1 x20
|
||||
|
||||
#define A_PREFETCH 256
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
.macro SAVE_REGS
	// Open an 11 * 16 byte stack frame and store d8-d17 and x18-x28 in it.
	sub	sp, sp, #(11 * 16)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]		// single register in the final slot
.endm
|
||||
|
||||
.macro RESTORE_REGS
	// Mirror of SAVE_REGS: reload every spilled register from its slot and
	// close the 11 * 16 byte frame. Load order is irrelevant.
	ldr	x28, [sp, #(10 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d8, d9, [sp, #(0 * 16)]
	add	sp, sp, #(11 * 16)
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x8
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	// Copy 8 floats from each of A01..A08 into one contiguous 256-byte
	// tile at B00. TEMP1 walks the tile; B00 itself then jumps by M8.
	mov	TEMP1, B00

	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64

	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	ldp	q8, q9, [A05], #32
	ldp	q10, q11, [A06], #32
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

	ldp	q12, q13, [A07], #32
	ldp	q14, q15, [A08], #32
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1], #64

	add	B00, B00, M8
.endm
|
||||
|
||||
.macro COPY4x8
	// Copy 4 floats from each of A01..A08 to B01 (128 bytes, contiguous).
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

	ldr	q4, [A05], #16
	ldr	q5, [A06], #16
	ldr	q6, [A07], #16
	ldr	q7, [A08], #16
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64
.endm
|
||||
|
||||
.macro COPY2x8
	// Copy 2 floats from each of A01..A08 to B02 (64 bytes, contiguous).
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	stp	d0, d1, [B02], #16
	stp	d2, d3, [B02], #16

	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8
	stp	d4, d5, [B02], #16
	stp	d6, d7, [B02], #16
.endm
|
||||
|
||||
.macro COPY1x8
	// Copy 1 float from each of A01..A08 to B03 (32 bytes, contiguous).
	//
	// Every source pointer is read with a 4-byte load and advanced by
	// exactly one element. The previous version re-read A05..A08 with
	// 8-byte loads post-incremented by 8, which touched 4 bytes beyond the
	// element being copied (a potential out-of-bounds read on the final
	// element of a line) and left A05..A08 advanced differently from
	// A01..A04. The values stored to B03 are unchanged.
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4
	stp	s0, s1, [B03], #8
	stp	s2, s3, [B03], #8

	ldr	s4, [A05], #4
	ldr	s5, [A06], #4
	ldr	s6, [A07], #4
	ldr	s7, [A08], #4
	stp	s4, s5, [B03], #8
	stp	s6, s7, [B03], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x4
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	// Copy 8 floats from each of A01..A04 into a 128-byte tile at B00.
	// TEMP1 walks the tile; B00 itself then jumps by M8.
	mov	TEMP1, B00

	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64

	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	add	B00, B00, M8
.endm
|
||||
|
||||
.macro COPY4x4
	// Copy 4 floats from each of A01..A04 to B01 (64 bytes, contiguous).
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64
.endm
|
||||
|
||||
.macro COPY2x4
	// Copy 2 floats from each of A01..A04 to B02 (32 bytes, contiguous).
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	stp	d0, d1, [B02], #16
	stp	d2, d3, [B02], #16
.endm
|
||||
|
||||
.macro COPY1x4
	// Copy 1 float from each of A01..A04 to B03 (16 bytes, contiguous).
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4
	stp	s0, s1, [B03], #8
	stp	s2, s3, [B03], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x2
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	// Copy 8 floats from each of A01 and A02 into a 64-byte tile at B00,
	// then step B00 by M8.
	ld1	{v0.4s, v1.4s}, [A01], #32
	ld1	{v2.4s, v3.4s}, [A02], #32

	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	B00, B00, M8
.endm
|
||||
|
||||
.macro COPY4x2
	// Copy 4 floats from each of A01 and A02 to B01 (32 bytes, contiguous).
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	stp	q0, q1, [B01], #32
.endm
|
||||
|
||||
.macro COPY2x2
	// Copy 2 floats from each of A01 and A02 to B02 (16 bytes, contiguous).
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	stp	d0, d1, [B02], #16
.endm
|
||||
|
||||
.macro COPY1x2
	// Copy 1 float from each of A01 and A02 to B03 (8 bytes, contiguous).
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	stp	s0, s1, [B03], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY8x1
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	// Copy 8 floats from A01 to B00, then step B00 by M8.
	ldp	q0, q1, [A01], #32
	stp	q0, q1, [B00]
	add	B00, B00, M8
.endm
|
||||
|
||||
.macro COPY4x1
	// Copy 4 floats from A01 to B01; both pointers step by 16 bytes.
	ldr	q0, [A01], #16
	str	q0, [B01], #16
.endm
|
||||
|
||||
.macro COPY2x1
	// Copy 2 floats from A01 to B02; both pointers step by 8 bytes.
	ldr	d0, [A01], #8
	str	d0, [B02], #8
.endm
|
||||
|
||||
.macro COPY1x1
	// Copy 1 float from A01 to B03; both pointers step by 4 bytes.
	ldr	s0, [A01], #4
	str	s0, [B03], #4
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE
|
||||
|
||||
lsl TEMP1, M, #2 // TEMP1 = M * SIZE
|
||||
|
||||
and B01 , N , #-8
|
||||
and B02 , N , #-4
|
||||
and B03 , N , #-2
|
||||
|
||||
mul B01, B01, TEMP1
|
||||
mul B02, B02, TEMP1
|
||||
mul B03, B03, TEMP1
|
||||
|
||||
add B01 , B01, B
|
||||
add B02 , B02, B
|
||||
add B03 , B03, B
|
||||
|
||||
lsl M8, M, #5 // M8 = M * 8 * SIZE
|
||||
|
||||
.Lsgemm_tcopy_L8_BEGIN:
|
||||
|
||||
asr J, M, #3 // J = M / 8
|
||||
cmp J, #0
|
||||
ble .Lsgemm_tcopy_L4_BEGIN
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_M8_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A05, A04, LDA
|
||||
add A06, A05, LDA
|
||||
add A07, A06, LDA
|
||||
add A08, A07, LDA
|
||||
add A, A08, LDA
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #256 // B = B + 8 * 8 * SIZE
|
||||
|
||||
asr I, N, #3 // I = N / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L8_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_M8_20:
|
||||
|
||||
COPY8x8
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L8_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L8_M8_60
|
||||
|
||||
COPY4x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L8_M8_80
|
||||
|
||||
COPY2x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_80:
|
||||
|
||||
tst N, #1
|
||||
ble .Lsgemm_tcopy_L8_M8_END
|
||||
|
||||
COPY1x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M8_END:
|
||||
|
||||
subs J, J, #1 // j--
|
||||
bne .Lsgemm_tcopy_L8_M8_BEGIN
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_tcopy_L4_BEGIN:
|
||||
|
||||
tst M, #7
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
tst M, #4
|
||||
ble .Lsgemm_tcopy_L2_BEGIN
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A, A04, LDA
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #128 // B = B + 4 * 8 * SIZE
|
||||
|
||||
asr I, N, #3 // I = N / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L4_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L4_M8_20:
|
||||
|
||||
COPY8x4
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L4_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L4_M8_60
|
||||
|
||||
COPY4x4
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L4_M8_80
|
||||
|
||||
COPY2x4
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_80:
|
||||
|
||||
tst N , #1
|
||||
ble .Lsgemm_tcopy_L4_M8_END
|
||||
|
||||
COPY1x4
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L4_M8_END:
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_tcopy_L2_BEGIN:
|
||||
|
||||
tst M, #3
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
tst M, #2
|
||||
ble .Lsgemm_tcopy_L1_BEGIN
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A, A02, LDA
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #64 // B = B + 2 * 8 * SIZE
|
||||
|
||||
asr I, N, #3 // I = N / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L2_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L2_M8_20:
|
||||
|
||||
COPY8x2
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L2_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L2_M8_60
|
||||
|
||||
COPY4x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L2_M8_80
|
||||
|
||||
COPY2x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_80:
|
||||
|
||||
tst N , #1
|
||||
ble .Lsgemm_tcopy_L2_M8_END
|
||||
|
||||
COPY1x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M8_END:
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
.Lsgemm_tcopy_L1_BEGIN:
|
||||
|
||||
tst M, #1
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_BEGIN:
|
||||
|
||||
mov A01, A // A01 = A
|
||||
mov B00, B
|
||||
|
||||
asr I, N, #3 // I = M / 8
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L1_M8_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L1_M8_20:
|
||||
|
||||
COPY8x1
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L1_M8_20
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_40:
|
||||
|
||||
tst N , #4
|
||||
ble .Lsgemm_tcopy_L1_M8_60
|
||||
|
||||
COPY4x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_60:
|
||||
|
||||
tst N , #2
|
||||
ble .Lsgemm_tcopy_L1_M8_80
|
||||
|
||||
COPY2x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_80:
|
||||
|
||||
tst N , #1
|
||||
ble .Lsgemm_tcopy_L1_M8_END
|
||||
|
||||
COPY1x1
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L1_M8_END:
|
||||
|
||||
.Lsgemm_tcopy_L999:
|
||||
|
||||
mov x0, #0 // set return value
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -39,24 +39,24 @@
|
|||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
||||
FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
||||
IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
||||
|
||||
FLOAT *boffset;
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT *boffset;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
|
|
@ -39,30 +39,30 @@
|
|||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
|
||||
FLOAT *boffset;
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
IFLOAT *boffset;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
|
||||
|
||||
aoffset = a;
|
||||
|
|
|
@ -39,22 +39,22 @@
|
|||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2;
|
||||
FLOAT *boffset;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2;
|
||||
IFLOAT *boffset;
|
||||
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
|
|
@ -39,32 +39,32 @@
|
|||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
|
||||
FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
|
|
@ -0,0 +1,225 @@
|
|||
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
include $(KERNELDIR)/KERNEL.POWER8
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||
SHGEMMKERNEL = shgemm_kernel_power10.c
|
||||
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMKERNEL = sgemm_kernel_power10.c
|
||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||
CTRMMKERNEL = cgemm_kernel_power10.S
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power10.c
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_power10.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#TODO: CGEMM3MKERNEL should use 4x4 block sizes.
|
||||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||
|
||||
#Pure C for other kernels
|
||||
#SAMAXKERNEL = ../arm/amax.c
|
||||
#DAMAXKERNEL = ../arm/amax.c
|
||||
#CAMAXKERNEL = ../arm/zamax.c
|
||||
#ZAMAXKERNEL = ../arm/zamax.c
|
||||
#
|
||||
#SAMINKERNEL = ../arm/amin.c
|
||||
#DAMINKERNEL = ../arm/amin.c
|
||||
#CAMINKERNEL = ../arm/zamin.c
|
||||
#ZAMINKERNEL = ../arm/zamin.c
|
||||
#
|
||||
#SMAXKERNEL = ../arm/max.c
|
||||
#DMAXKERNEL = ../arm/max.c
|
||||
#
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
# ISAMAX kernel selection: when GCCVERSIONGTEQ9 is not 1, use the assembly
# kernel isamax_power9.S; when it is 1, build the C implementation isamax.c.
# NOTE(review): presumably the C version needs GCC >= 9 - confirm where
# GCCVERSIONGTEQ9 is set.
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ISAMAXKERNEL = isamax_power9.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
# ICAMAX kernel selection: when GCCVERSIONGTEQ9 is not 1, use the assembly
# kernel icamax_power9.S; when it is 1, build the C implementation icamax.c.
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ICAMAXKERNEL = icamax_power9.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
# ISAMIN kernel selection: when GCCVERSIONGTEQ9 is not 1, use the assembly
# kernel isamin_power9.S; when it is 1, build the C implementation isamin.c.
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ISAMINKERNEL = isamin_power9.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
# ICAMIN kernel selection: when GCCVERSIONGTEQ9 is not 1, use the assembly
# kernel icamin_power9.S; when it is 1, build the C implementation icamin.c.
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ICAMINKERNEL = icamin_power9.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
#IDMAXKERNEL = ../arm/imax.c
|
||||
#
|
||||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
# CAXPY kernel selection: when GCCVERSIONGTEQ9 is not 1, use the assembly
# kernel caxpy_power9.S; when it is 1, build the C implementation caxpy.c.
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CAXPYKERNEL = caxpy_power9.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
# CDOT kernel selection: when GCCVERSIONGTEQ9 is not 1, use the assembly
# kernel cdot_power9.S; when it is 1, build the C implementation cdot.c.
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
else
|
||||
CDOTKERNEL = cdot.c
|
||||
endif
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n_power10.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t_power10.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
||||
endif
|
|
@ -1,3 +1,44 @@
|
|||
# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
SGEMMKERNEL = gemm_kernel_power6.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = gemm_ncopy_4.S
|
||||
SGEMMOTCOPY = gemm_tcopy_4.S
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMKERNEL = gemm_kernel_power6.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = gemm_ncopy_4.S
|
||||
DGEMMOTCOPY = gemm_tcopy_4.S
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMKERNEL = zgemm_kernel_power6.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_power6.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
DTRSMKERNEL_LN = trsm_kernel_power6_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_power6_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_power6_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_power6_RT.S
|
||||
else
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
|
@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c
|
|||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
#
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
else
|
||||
ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
|
@ -162,6 +215,7 @@ endif
|
|||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
endif
|
||||
#
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
|
@ -232,3 +286,10 @@ QCABS_KERNEL = ../generic/cabs.c
|
|||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
||||
ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
|
||||
IDAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
endif
|
||||
|
|
|
@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S
|
|||
|
||||
SDOTKERNEL = dot_ppc440.S
|
||||
DDOTKERNEL = dot_ppc440.S
|
||||
CDOTKERNEL = zdot_ppc440.S
|
||||
ZDOTKERNEL = zdot_ppc440.S
|
||||
#CDOTKERNEL = zdot_ppc440.S
|
||||
#ZDOTKERNEL = zdot_ppc440.S
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
|
||||
ISAMAXKERNEL = iamax_ppc440.S
|
||||
IDAMAXKERNEL = iamax_ppc440.S
|
||||
|
@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
|||
|
||||
SROTKERNEL = rot_ppc440.S
|
||||
DROTKERNEL = rot_ppc440.S
|
||||
CROTKERNEL = zrot_ppc440.S
|
||||
ZROTKERNEL = zrot_ppc440.S
|
||||
#CROTKERNEL = zrot_ppc440.S
|
||||
#ZROTKERNEL = zrot_ppc440.S
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
|
||||
SSCALKERNEL = scal_ppc440.S
|
||||
DSCALKERNEL = scal_ppc440.S
|
||||
|
@ -78,13 +83,18 @@ DGEMMINCOPYOBJ =
|
|||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMKERNEL = zgemm_kernel_altivec_g4.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
#CGEMMKERNEL = zgemm_kernel_altivec_g4.S
|
||||
#CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
#CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
# CGEMM on PPCG4: use the plain zgemm_kernel.S instead of the AltiVec G4
# kernel. The inner (IN/IT) copy sources are cleared explicitly; only the
# outer (ON/OT) copies are provided, by the generic 2-wide C routines.
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY =
|
||||
# Was a second, empty "CGEMMONCOPY =" that the next assignment overwrote;
# the variable meant to be cleared alongside CGEMMINCOPY is CGEMMITCOPY.
CGEMMITCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ =
|
||||
#cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ =
|
||||
#cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_g4.S
|
||||
|
|
|
@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "casum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ccopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
|
|
@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zdot.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
return (result);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -424,7 +424,7 @@ L999:
|
|||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
addi r11, 224
|
||||
addi r11, SP, 224
|
||||
#endif
|
||||
lvx v20, r11, r0
|
||||
addi r11, r11, 16
|
||||
|
@ -459,4 +459,4 @@ L999:
|
|||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif^
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,286 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
|
||||
#define LOAD ld
|
||||
#define STACKSIZE (512 )
|
||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
|
||||
|
||||
#define alpha_r vs51
|
||||
#define alpha_i vs55
|
||||
#define save_permute_1 vs59
|
||||
#define permute_mask vs63
|
||||
#define o0 0
|
||||
|
||||
|
||||
#define T1 r11
|
||||
#define T2 r12
|
||||
#define T3 r14
|
||||
#define T4 r15
|
||||
#define T5 r16
|
||||
#define T6 r17
|
||||
#define L r18
|
||||
#define T7 r19
|
||||
#define T8 r20
|
||||
#define TEMP_REG r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define T9 r27
|
||||
#define T10 r28
|
||||
#define PRE r29
|
||||
|
||||
#define T12 r30
|
||||
#define T13 r31
|
||||
|
||||
#include "cgemm_macros_power10.S"
|
||||
|
||||
.equ perm_const1, 0x0405060700010203
|
||||
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||
.equ save_permute_11, 0x0405060714151617
|
||||
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
mflr r0
|
||||
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
stxv vs52, 288(SP)
|
||||
stxv vs53, 304(SP)
|
||||
stxv vs54, 320(SP)
|
||||
stxv vs55, 336(SP)
|
||||
stxv vs56, 352(SP)
|
||||
stxv vs57, 368(SP)
|
||||
stxv vs58, 384(SP)
|
||||
stxv vs59, 400(SP)
|
||||
stxv vs60, 416(SP)
|
||||
stxv vs61, 432(SP)
|
||||
stxv vs62, 448(SP)
|
||||
stxv vs63, 464(SP)
|
||||
std r0, FLINK_SAVE(SP)
|
||||
|
||||
|
||||
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
|
||||
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
slwi LDC, LDC, ZBASE_SHIFT
|
||||
|
||||
|
||||
|
||||
/*alpha_r arrives in f1 (vs1) and alpha_i in f2 (vs2); convert each to single precision and splat it across the vector*/
|
||||
xscvdpspn alpha_r,vs1
|
||||
xscvdpspn alpha_i,vs2
|
||||
xxspltw alpha_r,alpha_r,0
|
||||
xxspltw alpha_i,alpha_i,0
|
||||
/*load reverse permute mask for big endian
|
||||
uint128 = 0x0c0d0e0f08090a0b0405060700010203
|
||||
*/
|
||||
|
||||
lis T2, perm_const2@highest
|
||||
lis T1, perm_const1@highest
|
||||
lis T3, save_permute_12@highest
|
||||
lis T4, save_permute_11@highest
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@higher
|
||||
ori T1, T1, perm_const1@higher
|
||||
ori T3, T3, save_permute_12@higher
|
||||
ori T4, T4, save_permute_11@higher
|
||||
|
||||
|
||||
rldicr T2, T2, 32, 31
|
||||
rldicr T1, T1, 32, 31
|
||||
rldicr T3, T3, 32, 31
|
||||
rldicr T4, T4, 32, 31
|
||||
|
||||
oris T2, T2, perm_const2@h
|
||||
oris T1, T1, perm_const1@h
|
||||
oris T3, T3, save_permute_12@h
|
||||
oris T4, T4, save_permute_11@h
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@l
|
||||
ori T1, T1, perm_const1@l
|
||||
ori T3, T3, save_permute_12@l
|
||||
ori T4, T4, save_permute_11@l
|
||||
|
||||
|
||||
li r0,0
|
||||
li PRE,512
|
||||
|
||||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||
/*negate for this case as we will use addition -1*(a+b) */
|
||||
xvnegsp alpha_r,alpha_r
|
||||
xvnegsp alpha_i,alpha_i
|
||||
#endif
|
||||
|
||||
mtvsrdd permute_mask,T2,T1
|
||||
mtvsrdd save_permute_1,T3,T4
|
||||
|
||||
/*mask is reverse permute so we have to make it inner permute */
|
||||
xxpermdi permute_mask, permute_mask, permute_mask,2
|
||||
|
||||
#include "cgemm_logic_power10.S"
|
||||
|
||||
.L999:
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
ld r0, FLINK_SAVE(SP)
|
||||
|
||||
lxv vs52, 288(SP)
|
||||
lxv vs53, 304(SP)
|
||||
lxv vs54, 320(SP)
|
||||
lxv vs55, 336(SP)
|
||||
lxv vs56, 352(SP)
|
||||
lxv vs57, 368(SP)
|
||||
lxv vs58, 384(SP)
|
||||
lxv vs59, 400(SP)
|
||||
mtlr r0
|
||||
lxv vs60, 416(SP)
|
||||
lxv vs61, 432(SP)
|
||||
lxv vs62, 448(SP)
|
||||
lxv vs63, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue