commit
e0fa24b216
|
@ -0,0 +1,81 @@
|
||||||
|
name: continuous build
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
os: [ubuntu-latest, macos-latest]
|
||||||
|
build: [cmake, make]
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: Compilation cache
|
||||||
|
uses: actions/cache@v2
|
||||||
|
with:
|
||||||
|
path: ~/.ccache
|
||||||
|
# We include the commit sha in the cache key, as new cache entries are
|
||||||
|
# only created if there is no existing entry for the key yet.
|
||||||
|
key: ${{ runner.os }}-ccache-${{ github.sha }}
|
||||||
|
# Restore any ccache cache entry, if none for
|
||||||
|
# ${{ runner.os }}-ccache-${{ github.sha }} exists
|
||||||
|
restore-keys: |
|
||||||
|
${{ runner.os }}-ccache
|
||||||
|
|
||||||
|
- name: Print system information
|
||||||
|
run: |
|
||||||
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
cat /proc/cpuinfo
|
||||||
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
|
sysctl -a | grep machdep.cpu
|
||||||
|
else
|
||||||
|
echo "$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Install Dependencies
|
||||||
|
run: |
|
||||||
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
sudo apt-get install -y gfortran cmake ccache
|
||||||
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
|
brew install coreutils cmake ccache
|
||||||
|
else
|
||||||
|
echo "$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
if: matrix.build == 'make'
|
||||||
|
run: |
|
||||||
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
export PATH="/usr/lib/ccache:${PATH}"
|
||||||
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
|
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
|
||||||
|
else
|
||||||
|
echo "$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0
|
||||||
|
|
||||||
|
- name: CMake build
|
||||||
|
if: matrix.build == 'cmake'
|
||||||
|
run: |
|
||||||
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
export PATH="/usr/lib/ccache:${PATH}"
|
||||||
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
|
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
|
||||||
|
else
|
||||||
|
echo "$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release ..
|
||||||
|
make -j$(nproc)
|
|
@ -21,6 +21,7 @@ jobs:
|
||||||
build-OpenBLAS-with-Homebrew:
|
build-OpenBLAS-with-Homebrew:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
env:
|
env:
|
||||||
|
DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer
|
||||||
HOMEBREW_DEVELOPER: "ON"
|
HOMEBREW_DEVELOPER: "ON"
|
||||||
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
|
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
|
||||||
HOMEBREW_NO_ANALYTICS: "ON"
|
HOMEBREW_NO_ANALYTICS: "ON"
|
||||||
|
|
|
@ -70,6 +70,7 @@ test/SBLAT2.SUMM
|
||||||
test/SBLAT3.SUMM
|
test/SBLAT3.SUMM
|
||||||
test/ZBLAT2.SUMM
|
test/ZBLAT2.SUMM
|
||||||
test/ZBLAT3.SUMM
|
test/ZBLAT3.SUMM
|
||||||
|
test/SHBLAT3.SUMM
|
||||||
test/cblat1
|
test/cblat1
|
||||||
test/cblat2
|
test/cblat2
|
||||||
test/cblat3
|
test/cblat3
|
||||||
|
@ -79,6 +80,7 @@ test/dblat3
|
||||||
test/sblat1
|
test/sblat1
|
||||||
test/sblat2
|
test/sblat2
|
||||||
test/sblat3
|
test/sblat3
|
||||||
|
test/test_shgemm
|
||||||
test/zblat1
|
test/zblat1
|
||||||
test/zblat2
|
test/zblat2
|
||||||
test/zblat3
|
test/zblat3
|
||||||
|
|
|
@ -16,7 +16,6 @@ matrix:
|
||||||
before_script: &common-before
|
before_script: &common-before
|
||||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||||
script:
|
script:
|
||||||
- set -e
|
|
||||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
- make -C test $COMMON_FLAGS $BTYPE
|
- make -C test $COMMON_FLAGS $BTYPE
|
||||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||||
|
@ -108,7 +107,6 @@ matrix:
|
||||||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||||
before_script: *common-before
|
before_script: *common-before
|
||||||
script:
|
script:
|
||||||
- set -e
|
|
||||||
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||||
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||||
|
@ -151,7 +149,6 @@ matrix:
|
||||||
before_script:
|
before_script:
|
||||||
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
||||||
script:
|
script:
|
||||||
- set -e
|
|
||||||
- mkdir build
|
- mkdir build
|
||||||
- CONFIG=Release
|
- CONFIG=Release
|
||||||
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
||||||
|
|
|
@ -6,7 +6,8 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||||
project(OpenBLAS C ASM)
|
project(OpenBLAS C ASM)
|
||||||
set(OpenBLAS_MAJOR_VERSION 0)
|
set(OpenBLAS_MAJOR_VERSION 0)
|
||||||
set(OpenBLAS_MINOR_VERSION 3)
|
set(OpenBLAS_MINOR_VERSION 3)
|
||||||
set(OpenBLAS_PATCH_VERSION 9.dev)
|
set(OpenBLAS_PATCH_VERSION 10.dev)
|
||||||
|
|
||||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||||
|
|
||||||
# Adhere to GNU filesystem layout conventions
|
# Adhere to GNU filesystem layout conventions
|
||||||
|
@ -23,6 +24,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun
|
||||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||||
|
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
|
||||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||||
else()
|
else()
|
||||||
|
@ -86,9 +88,13 @@ if (NOT NO_LAPACK)
|
||||||
list(APPEND SUBDIRS lapack)
|
list(APPEND SUBDIRS lapack)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (NOT DEFINED BUILD_HALF)
|
||||||
|
set (BUILD_HALF false)
|
||||||
|
endif ()
|
||||||
# set which float types we want to build for
|
# set which float types we want to build for
|
||||||
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
|
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
|
||||||
# if none are defined, build for all
|
# if none are defined, build for all
|
||||||
|
# set(BUILD_HALF true)
|
||||||
set(BUILD_SINGLE true)
|
set(BUILD_SINGLE true)
|
||||||
set(BUILD_DOUBLE true)
|
set(BUILD_DOUBLE true)
|
||||||
set(BUILD_COMPLEX true)
|
set(BUILD_COMPLEX true)
|
||||||
|
@ -120,6 +126,11 @@ if (BUILD_COMPLEX16)
|
||||||
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
|
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (BUILD_HALF)
|
||||||
|
message(STATUS "Building Half Precision")
|
||||||
|
list(APPEND FLOAT_TYPES "HALF") # defines nothing
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
|
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
|
||||||
message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.")
|
message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.")
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -234,7 +245,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||||
if (NOT MSVC)
|
if (NOT MSVC)
|
||||||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
||||||
else()
|
else()
|
||||||
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
|
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
|
@ -180,3 +180,13 @@ In chronological order:
|
||||||
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||||
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
||||||
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
||||||
|
|
||||||
|
* Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR>
|
||||||
|
* [2020-04-15] Half-precision GEMM for bfloat16
|
||||||
|
|
||||||
|
* Marius Hillenbrand <https://github.com/mhillenibm>
|
||||||
|
* [2020-05-12] Revise dynamic architecture detection for IBM z
|
||||||
|
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14
|
||||||
|
|
||||||
|
* Danfeng Zhang <https://github.com/craft-zhang>
|
||||||
|
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
|
|
@ -1,4 +1,77 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.10
|
||||||
|
14-Jun-2020
|
||||||
|
|
||||||
|
common:
|
||||||
|
* Improved thread locking behaviour in blas_server and parallel getrf
|
||||||
|
* Imported bugfix 394 from LAPACK (spurious reference to "XERBL"
|
||||||
|
due to overlong lines)
|
||||||
|
* Imported bugfix 403 from LAPACK (compile option "recursive" required
|
||||||
|
for correctness with Intel and PGI)
|
||||||
|
* Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB)
|
||||||
|
* Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP)
|
||||||
|
* Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that
|
||||||
|
could lead to crashes at large matrix sizes
|
||||||
|
* Restored internal soname in dynamic libraries on FreeBSD and Dragonfly
|
||||||
|
* Added API (openblas_setaffinity) to set the thread affinity on Linux
|
||||||
|
* Added initial infrastructure for half-precision floating point
|
||||||
|
(bfloat16) support with a generic implementation of SHGEMM
|
||||||
|
* Added CMAKE build system support for building the cblas_Xgemm3m
|
||||||
|
functions
|
||||||
|
* Fixed CMAKE support for building in a path with embedded spaces
|
||||||
|
* Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC
|
||||||
|
* Fixed GCC version detection in the Makefiles
|
||||||
|
* Allowed overriding the names of AR, AS and LD in Makefile builds
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* Fixed big-endian POWER8 ELFv2 builds on FreeBSD
|
||||||
|
* Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9
|
||||||
|
* Fixed CMAKE build support for POWER9
|
||||||
|
* Fixed a potential race condition in the thread buffer allocation
|
||||||
|
* Worked around LAPACK test failures on PPC G4
|
||||||
|
|
||||||
|
MIPS:
|
||||||
|
* Fixed a potential race condition in the thread buffer allocation
|
||||||
|
* Added support for MIPS 24K/24KE family based on P5600 kernels
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
* Fixed a potential race condition in the thread buffer allocation
|
||||||
|
* Added TARGET=GENERIC
|
||||||
|
|
||||||
|
ARMV7:
|
||||||
|
* Fixed a race condition in the thread buffer allocation
|
||||||
|
|
||||||
|
ARMV8:
|
||||||
|
* Fixed a race condition in the thread buffer allocation
|
||||||
|
* Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA
|
||||||
|
* Improved performance of the ThunderX2 DAXPY kernel
|
||||||
|
* Added an optimized SGEMM kernel for Cortex A53
|
||||||
|
* Fixed Makefile support for INTERFACE64 (8-byte integer)
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* Fixed a syntax error in the CMAKE setup for SkylakeX
|
||||||
|
* Improved performance of STRSM on Haswell, SkylakeX and Ryzen
|
||||||
|
* Improved SGEMM performance for workloads with ldc a
|
||||||
|
multiple of 1024
|
||||||
|
* Improved DGEMM performance on Skylake X
|
||||||
|
* Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH
|
||||||
|
builds created on SkylakeX
|
||||||
|
* Removed data alignment requirement in the SSE2 copy kernels
|
||||||
|
that could cause spurious crashes
|
||||||
|
* Added a workaround for an optimizer bug in AppleClang 11.0.3
|
||||||
|
* Fixed LAPACK test failures due to wrong options for Intel Fortran
|
||||||
|
* Fixed compilation and LAPACK test results with recent Flang
|
||||||
|
and AMD AOCC
|
||||||
|
* Fixed DYNAMIC_ARCH builds with CMAKE on OS X
|
||||||
|
* Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max,
|
||||||
|
cblas_?sum, cblas_?gemm3m in the shared library on OS X
|
||||||
|
* Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes
|
||||||
|
show the name of an older generation chip supported by the same kernels)
|
||||||
|
|
||||||
|
IBM Z:
|
||||||
|
* Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.3.9
|
Version 0.3.9
|
||||||
1-Mar-2020
|
1-Mar-2020
|
||||||
|
|
1
Makefile
1
Makefile
|
@ -264,6 +264,7 @@ lapack_prebuild :
|
||||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
|
|
@ -9,6 +9,16 @@ else
|
||||||
USE_OPENMP = 1
|
USE_OPENMP = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Optimisation flags appended when CORE is POWER10.  The C/common and Fortran
# flag sets are chosen by USE_OPENMP: the OpenMP variant additionally passes
# -DUSE_OPENMP and -fopenmp.  Both variants place -fno-fast-math after the
# optimisation level on the same line.
# NOTE(review): -mcpu=future / -mtune=future is presumably a placeholder
# target name for POWER10 - confirm against the compilers in use.
ifeq ($(CORE), POWER10)
|
||||||
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
|
FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
|
else
|
||||||
|
COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math
|
||||||
|
FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), POWER9)
|
ifeq ($(CORE), POWER9)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
|
|
|
@ -17,7 +17,11 @@ ifdef CPUIDEMU
|
||||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(TARGET), 1004K)
|
ifeq ($(TARGET), MIPS24K)
|
||||||
|
TARGET_FLAGS = -mips32r2
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), MIPS1004K)
|
||||||
TARGET_FLAGS = -mips32r2
|
TARGET_FLAGS = -mips32r2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.9.dev
|
VERSION = 0.3.10.dev
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -273,6 +273,9 @@ COMMON_PROF = -pg
|
||||||
#
|
#
|
||||||
# CPP_THREAD_SAFETY_TEST = 1
|
# CPP_THREAD_SAFETY_TEST = 1
|
||||||
|
|
||||||
|
|
||||||
|
# If you want to enable the experimental BFLOAT16 support
|
||||||
|
# BUILD_HALF = 1
|
||||||
#
|
#
|
||||||
# End of user configuration
|
# End of user configuration
|
||||||
#
|
#
|
||||||
|
|
|
@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
|
||||||
override ARCH=x86_64
|
override ARCH=x86_64
|
||||||
else ifeq ($(ARCH), powerpc64)
|
else ifeq ($(ARCH), powerpc64)
|
||||||
override ARCH=power
|
override ARCH=power
|
||||||
|
else ifeq ($(ARCH), powerpc)
|
||||||
|
override ARCH=power
|
||||||
else ifeq ($(ARCH), i386)
|
else ifeq ($(ARCH), i386)
|
||||||
override ARCH=x86
|
override ARCH=x86
|
||||||
else ifeq ($(ARCH), aarch64)
|
else ifeq ($(ARCH), aarch64)
|
||||||
|
@ -261,10 +263,10 @@ endif
|
||||||
|
|
||||||
ARFLAGS =
|
ARFLAGS =
|
||||||
CPP = $(COMPILER) -E
|
CPP = $(COMPILER) -E
|
||||||
AR = $(CROSS_SUFFIX)ar
|
AR ?= $(CROSS_SUFFIX)ar
|
||||||
AS = $(CROSS_SUFFIX)as
|
AS ?= $(CROSS_SUFFIX)as
|
||||||
LD = $(CROSS_SUFFIX)ld
|
LD ?= $(CROSS_SUFFIX)ld
|
||||||
RANLIB = $(CROSS_SUFFIX)ranlib
|
RANLIB ?= $(CROSS_SUFFIX)ranlib
|
||||||
NM = $(CROSS_SUFFIX)nm
|
NM = $(CROSS_SUFFIX)nm
|
||||||
DLLWRAP = $(CROSS_SUFFIX)dllwrap
|
DLLWRAP = $(CROSS_SUFFIX)dllwrap
|
||||||
OBJCOPY = $(CROSS_SUFFIX)objcopy
|
OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||||
|
@ -277,6 +279,17 @@ NO_LAPACK = 1
|
||||||
override FEXTRALIB =
|
override FEXTRALIB =
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# GCC version probes, defined only when the C compiler is GCC.  Each variable
# is assigned with ":=" (expanded once, here) and holds the output of expr for
# one comparison - "1" when it holds, "0" otherwise - against the first
# ("cut -f1 -d.") or second ("cut -f2 -d.") dot-separated field of
# "$(CC) -dumpversion".
# NOTE(review): if -dumpversion prints only a major number, "cut -f2 -d."
# presumably returns the whole line, so the MINOR probes would compare the
# major version - confirm on the toolchains of interest.
ifeq ($(C_COMPILER), GCC)
|
||||||
|
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||||
|
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||||
|
GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5)
|
||||||
|
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||||
|
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||||
|
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||||
|
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
|
||||||
|
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# OS dependent settings
|
# OS dependent settings
|
||||||
#
|
#
|
||||||
|
@ -323,13 +336,7 @@ ifeq ($(C_COMPILER), CLANG)
|
||||||
CCOMMON_OPT += -DMS_ABI
|
CCOMMON_OPT += -DMS_ABI
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), GCC)
|
|
||||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
|
||||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
|
||||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
|
||||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
|
||||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
|
||||||
ifeq ($(GCCVERSIONGT4), 1)
|
ifeq ($(GCCVERSIONGT4), 1)
|
||||||
# GCC Major version > 4
|
# GCC Major version > 4
|
||||||
# It is compatible with MSVC ABI.
|
# It is compatible with MSVC ABI.
|
||||||
|
@ -343,7 +350,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
|
||||||
CCOMMON_OPT += -DMS_ABI
|
CCOMMON_OPT += -DMS_ABI
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
|
|
||||||
# Ensure the correct stack alignment on Win32
|
# Ensure the correct stack alignment on Win32
|
||||||
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
|
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
|
||||||
|
@ -563,8 +569,34 @@ DYNAMIC_CORE += EMAG8180
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), zarch)
|
ifeq ($(ARCH), zarch)
|
||||||
DYNAMIC_CORE = Z13
|
DYNAMIC_CORE = ZARCH_GENERIC
|
||||||
|
|
||||||
|
# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
|
||||||
|
ifeq ($(GCCVERSIONGT5), 1)
|
||||||
|
ZARCH_SUPPORT_Z13 := 1
|
||||||
|
else ifeq ($(GCCVERSIONEQ5), 1)
|
||||||
|
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
||||||
|
ZARCH_SUPPORT_Z13 := 1
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
|
||||||
|
ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1)
|
||||||
|
ZARCH_SUPPORT_Z13 := 1
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ZARCH_SUPPORT_Z13), 1)
|
||||||
|
DYNAMIC_CORE += Z13
|
||||||
|
else
|
||||||
|
$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||||
DYNAMIC_CORE += Z14
|
DYNAMIC_CORE += Z14
|
||||||
|
else
|
||||||
|
$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), power)
|
ifeq ($(ARCH), power)
|
||||||
|
@ -572,14 +604,20 @@ DYNAMIC_CORE = POWER6
|
||||||
DYNAMIC_CORE += POWER8
|
DYNAMIC_CORE += POWER8
|
||||||
ifneq ($(C_COMPILER), GCC)
|
ifneq ($(C_COMPILER), GCC)
|
||||||
DYNAMIC_CORE += POWER9
|
DYNAMIC_CORE += POWER9
|
||||||
|
DYNAMIC_CORE += POWER10
|
||||||
endif
|
endif
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
|
||||||
# Add POWER9 to the DYNAMIC_ARCH core list only when the GCC major version is
# greater than 5; otherwise report why it was left out.  The diagnostic is
# written as "$(info text)": make recognises a function name only when it is
# followed by whitespace, so the former "$(info, text)" expanded as an
# undefined variable reference and printed nothing.
ifeq ($(GCCVERSIONGT5), 1)
|
ifeq ($(GCCVERSIONGT5), 1)
|
||||||
DYNAMIC_CORE += POWER9
|
DYNAMIC_CORE += POWER9
|
||||||
else
|
else
|
||||||
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||||
endif
|
endif
|
||||||
|
# POWER10 kernels are only added to the DYNAMIC_ARCH core list when the GCC
# major version (first field of "$(CC) -dumpversion") is at least 11; expr
# prints 1 when the comparison holds.
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||||
|
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||||
|
DYNAMIC_CORE += POWER10
|
||||||
|
else
|
||||||
|
# No comma after "info": make recognises a function name only when it is
# followed by whitespace, so "$(info, text)" expanded as an undefined
# variable reference and the message was never printed.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -690,7 +728,12 @@ CCOMMON_OPT += -march=mips64
|
||||||
FCOMMON_OPT += -march=mips64
|
FCOMMON_OPT += -march=mips64
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), 1004K)
|
ifeq ($(CORE), MIPS24K)
|
||||||
|
CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
|
||||||
|
FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), MIPS1004K)
|
||||||
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||||
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||||
endif
|
endif
|
||||||
|
@ -755,6 +798,15 @@ endif
|
||||||
|
|
||||||
ifeq ($(F_COMPILER), FLANG)
|
ifeq ($(F_COMPILER), FLANG)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||||
|
FCOMMON_OPT += -Mrecursive -Kieee
|
||||||
|
ifeq ($(OSNAME), Linux)
|
||||||
|
ifeq ($(ARCH), x86_64)
|
||||||
|
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
|
||||||
|
ifeq ($(FLANG_VENDOR),AOCC)
|
||||||
|
FCOMMON_OPT += -fno-unroll-loops
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
ifneq ($(INTERFACE64), 0)
|
ifneq ($(INTERFACE64), 0)
|
||||||
|
@ -850,7 +902,7 @@ ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -i8
|
FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
FCOMMON_OPT += -recursive
|
FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
FCOMMON_OPT += -fopenmp
|
FCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
|
@ -1119,6 +1171,10 @@ ifeq ($(USE_TLS), 1)
|
||||||
CCOMMON_OPT += -DUSE_TLS
|
CCOMMON_OPT += -DUSE_TLS
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# When BUILD_HALF is 1 (the experimental BFLOAT16 option), define BUILD_HALF
# for the C preprocessor via the common C options.
ifeq ($(BUILD_HALF), 1)
|
||||||
|
CCOMMON_OPT += -DBUILD_HALF
|
||||||
|
endif
|
||||||
|
|
||||||
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
||||||
|
|
||||||
ifndef SYMBOLPREFIX
|
ifndef SYMBOLPREFIX
|
||||||
|
@ -1145,6 +1201,7 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||||
|
|
||||||
include $(TOPDIR)/Makefile.$(ARCH)
|
include $(TOPDIR)/Makefile.$(ARCH)
|
||||||
|
|
||||||
|
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
|
||||||
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
||||||
|
|
||||||
ifeq ($(CORE), PPC440)
|
ifeq ($(CORE), PPC440)
|
||||||
|
@ -1237,7 +1294,6 @@ endif
|
||||||
|
|
||||||
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||||
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||||
|
|
||||||
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||||
#MAKEOVERRIDES =
|
#MAKEOVERRIDES =
|
||||||
|
@ -1344,6 +1400,7 @@ export ARCH
|
||||||
export CORE
|
export CORE
|
||||||
export LIBCORE
|
export LIBCORE
|
||||||
export __BYTE_ORDER__
|
export __BYTE_ORDER__
|
||||||
|
export ELF_VERSION
|
||||||
export PGCPATH
|
export PGCPATH
|
||||||
export CONFIG
|
export CONFIG
|
||||||
export CC
|
export CC
|
||||||
|
@ -1389,7 +1446,10 @@ export KERNELDIR
|
||||||
export FUNCTION_PROFILE
|
export FUNCTION_PROFILE
|
||||||
export TARGET_CORE
|
export TARGET_CORE
|
||||||
export NO_AVX512
|
export NO_AVX512
|
||||||
|
export BUILD_HALF
|
||||||
|
|
||||||
|
export SHGEMM_UNROLL_M
|
||||||
|
export SHGEMM_UNROLL_N
|
||||||
export SGEMM_UNROLL_M
|
export SGEMM_UNROLL_M
|
||||||
export SGEMM_UNROLL_N
|
export SGEMM_UNROLL_N
|
||||||
export DGEMM_UNROLL_M
|
export DGEMM_UNROLL_M
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
@ -9,8 +10,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
|
||||||
HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
|
||||||
BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
||||||
BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P)
|
BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P)
|
||||||
|
|
||||||
ifdef EXPRECISION
|
ifdef EXPRECISION
|
||||||
BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
||||||
|
@ -22,6 +23,7 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
||||||
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
|
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX
|
||||||
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
|
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
|
||||||
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
|
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
|
||||||
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
|
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
|
||||||
|
@ -29,6 +31,7 @@ $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
|
||||||
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
|
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
|
||||||
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
|
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
|
||||||
|
|
||||||
|
$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
||||||
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
||||||
$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
||||||
$(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
$(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
||||||
|
|
|
@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), Z14)
|
ifeq ($(CORE), Z14)
|
||||||
CCOMMON_OPT += -march=z14 -mzvector
|
CCOMMON_OPT += -march=z14 -mzvector -O3
|
||||||
FCOMMON_OPT += -march=z14 -mzvector
|
FCOMMON_OPT += -march=z14 -mzvector
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -122,6 +122,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
||||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||||
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
||||||
|
|
||||||
|
#### MIPS32
|
||||||
|
|
||||||
|
- **MIPS 1004K**: uses P5600 codes
|
||||||
|
- **MIPS 24K**: uses P5600 codes
|
||||||
|
|
||||||
#### MIPS64
|
#### MIPS64
|
||||||
|
|
||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||||
|
|
|
@ -49,6 +49,7 @@ POWER6
|
||||||
POWER7
|
POWER7
|
||||||
POWER8
|
POWER8
|
||||||
POWER9
|
POWER9
|
||||||
|
POWER10
|
||||||
PPCG4
|
PPCG4
|
||||||
PPC970
|
PPC970
|
||||||
PPC970MP
|
PPC970MP
|
||||||
|
@ -58,7 +59,8 @@ CELL
|
||||||
|
|
||||||
3.MIPS CPU:
|
3.MIPS CPU:
|
||||||
P5600
|
P5600
|
||||||
1004K
|
MIPS1004K
|
||||||
|
MIPS24K
|
||||||
|
|
||||||
4.MIPS64 CPU:
|
4.MIPS64 CPU:
|
||||||
SICORTEX
|
SICORTEX
|
||||||
|
|
|
@ -49,3 +49,23 @@ jobs:
|
||||||
# we need a privileged docker run for sde process attachment
|
# we need a privileged docker run for sde process attachment
|
||||||
docker run --privileged intel_sde
|
docker run --privileged intel_sde
|
||||||
displayName: 'Run AVX512 SkylakeX docker build / test'
|
displayName: 'Run AVX512 SkylakeX docker build / test'
|
||||||
|
|
||||||
|
- job: Windows_cl
|
||||||
|
pool:
|
||||||
|
vmImage: 'windows-latest'
|
||||||
|
steps:
|
||||||
|
- task: CMake@1
|
||||||
|
inputs:
|
||||||
|
workingDirectory: 'build' # Optional
|
||||||
|
cmakeArgs: '-G "Visual Studio 16 2019" ..'
|
||||||
|
- task: CMake@1
|
||||||
|
inputs:
|
||||||
|
cmakeArgs: '--build . --config Release'
|
||||||
|
workingDirectory: 'build'
|
||||||
|
- script: |
|
||||||
|
cd build
|
||||||
|
cd utest
|
||||||
|
dir
|
||||||
|
openblas_utest.exe
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -49,6 +49,12 @@ else
|
||||||
GOTO_LAPACK_TARGETS=
|
GOTO_LAPACK_TARGETS=
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Extra benchmark target appended to the "goto" target lists.
# Empty unless BUILD_HALF is exactly 1, in which case the shgemm benchmark
# is built as well.
GOTO_HALF_TARGETS =
ifeq ($(BUILD_HALF),1)
GOTO_HALF_TARGETS = shgemm.goto
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
|
|
||||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
|
@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto
|
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS)
|
||||||
|
|
||||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||||
|
@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
||||||
samin.goto damin.goto camin.goto zamin.goto \
|
samin.goto damin.goto camin.goto zamin.goto \
|
||||||
smin.goto dmin.goto \
|
smin.goto dmin.goto \
|
||||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
|
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
|
||||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
|
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
|
||||||
|
|
||||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||||
|
@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Sgemm ####################################################
|
##################################### Sgemm ####################################################
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME)
|
||||||
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
endif
|
||||||
|
|
||||||
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
|
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
|
@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX)
|
||||||
|
|
||||||
##################################### Sgeev ####################################################
|
##################################### Sgeev ####################################################
|
||||||
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
|
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
sgeev.acml : sgeev.$(SUFFIX)
|
sgeev.acml : sgeev.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX)
|
||||||
|
|
||||||
##################################### Dgeev ####################################################
|
##################################### Dgeev ####################################################
|
||||||
dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME)
|
dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
dgeev.acml : dgeev.$(SUFFIX)
|
dgeev.acml : dgeev.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX)
|
||||||
##################################### Cgeev ####################################################
|
##################################### Cgeev ####################################################
|
||||||
|
|
||||||
cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME)
|
cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
cgeev.acml : cgeev.$(SUFFIX)
|
cgeev.acml : cgeev.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX)
|
||||||
##################################### Zgeev ####################################################
|
##################################### Zgeev ####################################################
|
||||||
|
|
||||||
zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME)
|
zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
zgeev.acml : zgeev.$(SUFFIX)
|
zgeev.acml : zgeev.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX)
|
||||||
|
|
||||||
##################################### Sgetri ####################################################
|
##################################### Sgetri ####################################################
|
||||||
sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME)
|
sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
sgetri.acml : sgetri.$(SUFFIX)
|
sgetri.acml : sgetri.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX)
|
||||||
|
|
||||||
##################################### Dgetri ####################################################
|
##################################### Dgetri ####################################################
|
||||||
dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME)
|
dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
dgetri.acml : dgetri.$(SUFFIX)
|
dgetri.acml : dgetri.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1913,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX)
|
||||||
##################################### Cgetri ####################################################
|
##################################### Cgetri ####################################################
|
||||||
|
|
||||||
cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME)
|
cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
cgetri.acml : cgetri.$(SUFFIX)
|
cgetri.acml : cgetri.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX)
|
||||||
##################################### Zgetri ####################################################
|
##################################### Zgetri ####################################################
|
||||||
|
|
||||||
zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME)
|
zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
|
||||||
zgetri.acml : zgetri.$(SUFFIX)
|
zgetri.acml : zgetri.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c
|
||||||
zcholesky.$(SUFFIX) : cholesky.c
|
zcholesky.$(SUFFIX) : cholesky.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
shgemm.$(SUFFIX) : gemm.c
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||||
|
endif
|
||||||
|
|
||||||
sgemm.$(SUFFIX) : gemm.c
|
sgemm.$(SUFFIX) : gemm.c
|
||||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
#define GEMM BLASFUNC(dgemm)
|
#define GEMM BLASFUNC(dgemm)
|
||||||
|
#elif defined(HALF)
|
||||||
|
#define GEMM BLASFUNC(shgemm)
|
||||||
#else
|
#else
|
||||||
#define GEMM BLASFUNC(sgemm)
|
#define GEMM BLASFUNC(sgemm)
|
||||||
#endif
|
#endif
|
||||||
|
@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
int main(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
IFLOAT *a, *b;
|
||||||
|
FLOAT *c;
|
||||||
FLOAT alpha[] = {1.0, 0.0};
|
FLOAT alpha[] = {1.0, 0.0};
|
||||||
FLOAT beta [] = {0.0, 0.0};
|
FLOAT beta [] = {0.0, 0.0};
|
||||||
char transa = 'N';
|
char transa = 'N';
|
||||||
|
@ -184,10 +187,10 @@ int main(int argc, char *argv[]){
|
||||||
k = to;
|
k = to;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) {
|
if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) {
|
||||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
}
|
}
|
||||||
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) {
|
if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) {
|
||||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
}
|
}
|
||||||
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
|
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
|
||||||
|
@ -199,10 +202,10 @@ int main(int argc, char *argv[]){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (i = 0; i < m * k * COMPSIZE; i++) {
|
for (i = 0; i < m * k * COMPSIZE; i++) {
|
||||||
a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
for (i = 0; i < k * n * COMPSIZE; i++) {
|
for (i = 0; i < k * n * COMPSIZE; i++) {
|
||||||
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
|
|
@ -170,9 +170,11 @@ int main(int argc, char *argv[]){
|
||||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
gettimeofday( &start, (struct timezone *)0);
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
#ifdef RETURN_BY_STACK
|
||||||
|
DOT (&result , &m, x, &inc_x, y, &inc_y );
|
||||||
|
#else
|
||||||
result = DOT (&m, x, &inc_x, y, &inc_y );
|
result = DOT (&m, x, &inc_x, y, &inc_y );
|
||||||
|
#endif
|
||||||
gettimeofday( &stop, (struct timezone *)0);
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||||
|
|
1
c_check
1
c_check
|
@ -310,6 +310,7 @@ $linker_a = "";
|
||||||
&& ($flags !~ /advapi32/)
|
&& ($flags !~ /advapi32/)
|
||||||
&& ($flags !~ /shell32/)
|
&& ($flags !~ /shell32/)
|
||||||
&& ($flags !~ /omp/)
|
&& ($flags !~ /omp/)
|
||||||
|
&& ($flags !~ /[0-9]+/)
|
||||||
) {
|
) {
|
||||||
$linker_l .= $flags . " "
|
$linker_l .= $flags . " "
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,7 +49,7 @@ if (DYNAMIC_ARCH)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (POWER)
|
if (POWER)
|
||||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (X86)
|
if (X86)
|
||||||
|
|
|
@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG")
|
||||||
if (USE_OPENMP)
|
if (USE_OPENMP)
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||||
endif ()
|
endif ()
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "G77")
|
if (${F_COMPILER} STREQUAL "G77")
|
||||||
|
|
|
@ -113,11 +113,31 @@ macro(SetDefaultL1)
|
||||||
set(ZSUMKERNEL zsum.S)
|
set(ZSUMKERNEL zsum.S)
|
||||||
set(QSUMKERNEL sum.S)
|
set(QSUMKERNEL sum.S)
|
||||||
set(XSUMKERNEL zsum.S)
|
set(XSUMKERNEL zsum.S)
|
||||||
|
if (BUILD_HALF)
|
||||||
|
set(SHAMINKERNEL ../arm/amin.c)
|
||||||
|
set(SHAMAXKERNEL ../arm/amax.c)
|
||||||
|
set(SHMAXKERNEL ../arm/max.c)
|
||||||
|
set(SHMINKERNEL ../arm/min.c)
|
||||||
|
set(ISHAMAXKERNEL ../arm/iamax.c)
|
||||||
|
set(ISHAMINKERNEL ../arm/iamin.c)
|
||||||
|
set(ISHMAXKERNEL ../arm/imax.c)
|
||||||
|
set(ISHMINKERNEL ../arm/imin.c)
|
||||||
|
set(SHASUMKERNEL ../arm/asum.c)
|
||||||
|
set(SHAXPYKERNEL ../arm/axpy.c)
|
||||||
|
set(SHAXPBYKERNEL ../arm/axpby.c)
|
||||||
|
set(SHCOPYKERNEL ../arm/copy.c)
|
||||||
|
set(SHDOTKERNEL ../arm/dot.c)
|
||||||
|
set(SHROTKERNEL ../arm/rot.c)
|
||||||
|
set(SHSCALKERNEL ../arm/scal.c)
|
||||||
|
set(SHNRM2KERNEL ../arm/nrm2.c)
|
||||||
|
set(SHSUMKERNEL ../arm/sum.c)
|
||||||
|
set(SHSWAPKERNEL ../arm/swap.c)
|
||||||
|
endif ()
|
||||||
endmacro ()
|
endmacro ()
|
||||||
|
|
||||||
macro(SetDefaultL2)
|
macro(SetDefaultL2)
|
||||||
set(SGEMVNKERNEL gemv_n.S)
|
set(SGEMVNKERNEL ../arm/gemv_n.c)
|
||||||
set(SGEMVTKERNEL gemv_t.S)
|
set(SGEMVTKERNEL ../arm/gemv_t.c)
|
||||||
set(DGEMVNKERNEL gemv_n.S)
|
set(DGEMVNKERNEL gemv_n.S)
|
||||||
set(DGEMVTKERNEL gemv_t.S)
|
set(DGEMVTKERNEL gemv_t.S)
|
||||||
set(CGEMVNKERNEL zgemv_n.S)
|
set(CGEMVNKERNEL zgemv_n.S)
|
||||||
|
@ -161,6 +181,11 @@ macro(SetDefaultL2)
|
||||||
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||||
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||||
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||||
|
if (BUILD_HALF)
|
||||||
|
set(SHGEMVNKERNEL ../arm/gemv_n.c)
|
||||||
|
set(SHGEMVTKERNEL ../arm/gemv_t.c)
|
||||||
|
set(SHGERKERNEL ../generic/ger.c)
|
||||||
|
endif ()
|
||||||
endmacro ()
|
endmacro ()
|
||||||
|
|
||||||
macro(SetDefaultL3)
|
macro(SetDefaultL3)
|
||||||
|
@ -168,4 +193,18 @@ macro(SetDefaultL3)
|
||||||
set(DGEADD_KERNEL ../generic/geadd.c)
|
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||||
|
if (BUILD_HALF)
|
||||||
|
set(SHGEADD_KERNEL ../generic/geadd.c)
|
||||||
|
set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||||
|
set(SHGEMM_BETA ../generic/gemm_beta.c)
|
||||||
|
set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||||
|
set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||||
|
set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||||
|
set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||||
|
set(SHGEMMINCOPYOBJ shgemm_incopy.o)
|
||||||
|
set(SHGEMMITCOPYOBJ shgemm_itcopy.o)
|
||||||
|
set(SHGEMMONCOPYOBJ shgemm_oncopy.o)
|
||||||
|
set(SHGEMMOTCOPYOBJ shgemm_otcopy.o)
|
||||||
|
endif ()
|
||||||
|
|
||||||
endmacro ()
|
endmacro ()
|
||||||
|
|
|
@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
|
||||||
set(NO_EXPRECISION 1)
|
set(NO_EXPRECISION 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
|
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin")
|
||||||
set(EXTRALIB "${EXTRALIB} -lm")
|
set(EXTRALIB "${EXTRALIB} -lm")
|
||||||
set(NO_EXPRECISION 1)
|
set(NO_EXPRECISION 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
# HAVE_SSE2
|
# HAVE_SSE2
|
||||||
# HAVE_SSE3
|
# HAVE_SSE3
|
||||||
# MAKE
|
# MAKE
|
||||||
|
# SHGEMM_UNROLL_M
|
||||||
|
# SHGEMM_UNROLL_N
|
||||||
# SGEMM_UNROLL_M
|
# SGEMM_UNROLL_M
|
||||||
# SGEMM_UNROLL_N
|
# SGEMM_UNROLL_N
|
||||||
# DGEMM_UNROLL_M
|
# DGEMM_UNROLL_M
|
||||||
|
@ -418,7 +420,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||||
set(ZGEMM_UNROLL_M 8)
|
set(ZGEMM_UNROLL_M 8)
|
||||||
set(ZGEMM_UNROLL_N 2)
|
set(ZGEMM_UNROLL_N 2)
|
||||||
set(SYMV_P 8)
|
set(SYMV_P 8)
|
||||||
elseif ("${TCORE}" STREQUAL "POWER9")
|
elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define L1_DATA_SIZE 32768\n"
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
"#define L1_DATA_LINESIZE 128\n"
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
@ -437,6 +439,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||||
set(ZGEMM_UNROLL_N 2)
|
set(ZGEMM_UNROLL_N 2)
|
||||||
set(SYMV_P 8)
|
set(SYMV_P 8)
|
||||||
endif()
|
endif()
|
||||||
|
set(SHGEMM_UNROLL_M 8)
|
||||||
|
set(SHGEMM_UNROLL_N 4)
|
||||||
|
|
||||||
# Or should this actually be NUM_CORES?
|
# Or should this actually be NUM_CORES?
|
||||||
if (${NUM_THREADS} GREATER 0)
|
if (${NUM_THREADS} GREATER 0)
|
||||||
|
@ -488,7 +492,7 @@ else(NOT CMAKE_CROSSCOMPILING)
|
||||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||||
SOURCES ${GETARCH_SRC}
|
SOURCES ${GETARCH_SRC}
|
||||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||||
OUTPUT_VARIABLE GETARCH_LOG
|
OUTPUT_VARIABLE GETARCH_LOG
|
||||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||||
)
|
)
|
||||||
|
@ -516,7 +520,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE
|
||||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||||
OUTPUT_VARIABLE GETARCH2_LOG
|
OUTPUT_VARIABLE GETARCH2_LOG
|
||||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||||
)
|
)
|
||||||
|
|
|
@ -297,6 +297,16 @@ if (USE_SIMPLE_THREADED_LEVEL3)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# On every platform except Windows, pass a MAX_STACK_ALLOC definition to the
# C compiler:
#   - MAX_STACK_ALLOC not set by the user -> default of 2048
#   - set to a non-zero value             -> that value
#   - set to 0                            -> no definition is added
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
  if (NOT DEFINED MAX_STACK_ALLOC)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
  elseif (NOT ${MAX_STACK_ALLOC} EQUAL 0)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}")
  endif ()
endif ()
|
||||||
|
|
||||||
if (DEFINED LIBNAMESUFFIX)
|
if (DEFINED LIBNAMESUFFIX)
|
||||||
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
||||||
else ()
|
else ()
|
||||||
|
@ -407,6 +417,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows
|
||||||
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
|
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# Release builds with the flang Fortran compiler, version 3 or older, get
# -fno-unroll-loops appended to the release Fortran flags.
# The version comparison is kept innermost so that it is only evaluated once
# the compiler is known to be flang and the build type is Release.
if ("${F_COMPILER}" STREQUAL "FLANG")
  if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3)
      set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops")
    endif ()
  endif ()
endif ()
|
||||||
|
|
||||||
if (NOT DEFINED SUFFIX)
|
if (NOT DEFINED SUFFIX)
|
||||||
set(SUFFIX o)
|
set(SUFFIX o)
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -530,6 +548,8 @@ endif ()
|
||||||
#export FUNCTION_PROFILE
|
#export FUNCTION_PROFILE
|
||||||
#export TARGET_CORE
|
#export TARGET_CORE
|
||||||
#
|
#
|
||||||
|
#export SHGEMM_UNROLL_M
|
||||||
|
#export SHGEMM_UNROLL_N
|
||||||
#export SGEMM_UNROLL_M
|
#export SGEMM_UNROLL_M
|
||||||
#export SGEMM_UNROLL_N
|
#export SGEMM_UNROLL_N
|
||||||
#export DGEMM_UNROLL_M
|
#export DGEMM_UNROLL_M
|
||||||
|
|
|
@ -15,12 +15,36 @@ endfunction ()
|
||||||
# Reads a Makefile into CMake vars.
|
# Reads a Makefile into CMake vars.
|
||||||
macro(ParseMakefileVars MAKEFILE_IN)
|
macro(ParseMakefileVars MAKEFILE_IN)
|
||||||
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
|
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
|
||||||
|
set (IfElse 0)
|
||||||
|
set (ElseSeen 0)
|
||||||
file(STRINGS ${MAKEFILE_IN} makefile_contents)
|
file(STRINGS ${MAKEFILE_IN} makefile_contents)
|
||||||
foreach (makefile_line ${makefile_contents})
|
foreach (makefile_line ${makefile_contents})
|
||||||
|
#message(STATUS "parsing ${makefile_line}")
|
||||||
|
if (${IfElse} GREATER 0)
|
||||||
|
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
|
||||||
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
|
# message(STATUS "ENDIF ${makefile_line}")
|
||||||
|
set (IfElse 0)
|
||||||
|
set (ElseSeen 0)
|
||||||
|
continue ()
|
||||||
|
endif ()
|
||||||
|
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
|
||||||
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
|
# message(STATUS "ELSE ${makefile_line}")
|
||||||
|
set (ElseSeen 1)
|
||||||
|
continue ()
|
||||||
|
endif()
|
||||||
|
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
|
||||||
|
# message(STATUS "skipping ${makefile_line}")
|
||||||
|
continue ()
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||||
if (NOT "${line_match}" STREQUAL "")
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
|
#message(STATUS "match on ${line_match}")
|
||||||
set(var_name ${CMAKE_MATCH_1})
|
set(var_name ${CMAKE_MATCH_1})
|
||||||
set(var_value ${CMAKE_MATCH_2})
|
# set(var_value ${CMAKE_MATCH_2})
|
||||||
|
string(STRIP ${CMAKE_MATCH_2} var_value)
|
||||||
# check for Makefile variables in the string, e.g. $(TSUFFIX)
|
# check for Makefile variables in the string, e.g. $(TSUFFIX)
|
||||||
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
|
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
|
||||||
foreach (make_var ${make_var_matches})
|
foreach (make_var ${make_var_matches})
|
||||||
|
@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
||||||
else ()
|
else ()
|
||||||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
||||||
if (NOT "${line_match}" STREQUAL "")
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
|
#message(STATUS "match on include ${line_match}")
|
||||||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
||||||
|
else ()
|
||||||
|
# message(STATUS "unmatched line ${line_match}")
|
||||||
|
# Minimal emulation of the two Makefile conditionals used by the kernel
# Makefiles:
#     ifeq  ($(VAR), VALUE)
#     ifneq ($(VAR), VALUE)
# The outcome is stored in IfElse, which the skipping logic at the top of the
# loop consults together with ElseSeen:
#     IfElse = 1  -> condition was true  (lines after a later "else" are skipped)
#     IfElse = 2  -> condition was false (lines up to the "else" are skipped)
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
  # CMAKE_MATCH_1 holds the *name* of the Makefile variable, CMAKE_MATCH_2 the
  # literal it is compared against. DEFINED must be given the variable name
  # (a single dereference); the value is obtained with a double dereference
  # and quoted so that an empty value cannot break the expression.
  if (DEFINED ${CMAKE_MATCH_1} AND "${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}")
    set (IfElse 1)
  else ()
    set (IfElse 2)
  endif ()
else ()
  string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
  if (NOT "${line_match}" STREQUAL "")
    # An undefined variable expands to the empty string, which (as in make)
    # compares unequal to any non-empty literal, so the condition is true.
    if (NOT ("${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}"))
      set (IfElse 1)
    else ()
      set (IfElse 2)
    endif ()
  endif ()
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
endforeach ()
|
endforeach ()
|
||||||
|
@ -163,6 +211,7 @@ function(GenerateNamedObjects sources_in)
|
||||||
if (complex_only)
|
if (complex_only)
|
||||||
list(REMOVE_ITEM float_list "SINGLE")
|
list(REMOVE_ITEM float_list "SINGLE")
|
||||||
list(REMOVE_ITEM float_list "DOUBLE")
|
list(REMOVE_ITEM float_list "DOUBLE")
|
||||||
|
list(REMOVE_ITEM float_list "HALF")
|
||||||
elseif (real_only)
|
elseif (real_only)
|
||||||
list(REMOVE_ITEM float_list "COMPLEX")
|
list(REMOVE_ITEM float_list "COMPLEX")
|
||||||
list(REMOVE_ITEM float_list "ZCOMPLEX")
|
list(REMOVE_ITEM float_list "ZCOMPLEX")
|
||||||
|
@ -176,6 +225,9 @@ function(GenerateNamedObjects sources_in)
|
||||||
if (NOT no_float_type)
|
if (NOT no_float_type)
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||||
string(TOLOWER ${float_char} float_char)
|
string(TOLOWER ${float_char} float_char)
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
set (float_char "sh")
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NOT name_in)
|
if (NOT name_in)
|
||||||
|
@ -210,6 +262,9 @@ function(GenerateNamedObjects sources_in)
|
||||||
if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX")
|
if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||||
list(APPEND obj_defines "DOUBLE")
|
list(APPEND obj_defines "DOUBLE")
|
||||||
endif ()
|
endif ()
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
list(APPEND obj_defines "HALF")
|
||||||
|
endif ()
|
||||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||||
list(APPEND obj_defines "COMPLEX")
|
list(APPEND obj_defines "COMPLEX")
|
||||||
if (mangle_complex_sources)
|
if (mangle_complex_sources)
|
||||||
|
|
23
common.h
23
common.h
|
@ -257,6 +257,11 @@ typedef long BLASLONG;
|
||||||
typedef unsigned long BLASULONG;
|
typedef unsigned long BLASULONG;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef BFLOAT16
|
||||||
|
typedef unsigned short bfloat16;
|
||||||
|
#define HALFCONVERSION 1
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef USE64BITINT
|
#ifdef USE64BITINT
|
||||||
typedef BLASLONG blasint;
|
typedef BLASLONG blasint;
|
||||||
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||||
|
@ -297,6 +302,13 @@ typedef int blasint;
|
||||||
#define SIZE 8
|
#define SIZE 8
|
||||||
#define BASE_SHIFT 3
|
#define BASE_SHIFT 3
|
||||||
#define ZBASE_SHIFT 4
|
#define ZBASE_SHIFT 4
|
||||||
|
#elif defined(HALF)
|
||||||
|
#define IFLOAT bfloat16
|
||||||
|
#define XFLOAT IFLOAT
|
||||||
|
#define FLOAT float
|
||||||
|
#define SIZE 2
|
||||||
|
#define BASE_SHIFT 1
|
||||||
|
#define ZBASE_SHIFT 2
|
||||||
#else
|
#else
|
||||||
#define FLOAT float
|
#define FLOAT float
|
||||||
#define SIZE 4
|
#define SIZE 4
|
||||||
|
@ -308,6 +320,10 @@ typedef int blasint;
|
||||||
#define XFLOAT FLOAT
|
#define XFLOAT FLOAT
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef IFLOAT
|
||||||
|
#define IFLOAT FLOAT
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#define COMPSIZE 1
|
#define COMPSIZE 1
|
||||||
#else
|
#else
|
||||||
|
@ -344,13 +360,8 @@ typedef int blasint;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef POWER8
|
|
||||||
#ifndef YIELDING
|
|
||||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef POWER9
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#ifndef YIELDING
|
#ifndef YIELDING
|
||||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -469,6 +469,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint
|
||||||
|
|
||||||
/* Level 3 routines */
|
/* Level 3 routines */
|
||||||
|
|
||||||
|
void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
|
||||||
|
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
|
||||||
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
|
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
|
||||||
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||||
void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
|
void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
|
||||||
|
|
|
@ -55,6 +55,8 @@ extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
|
||||||
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
||||||
|
|
||||||
|
|
||||||
|
int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||||
|
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
|
||||||
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||||
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
|
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
|
||||||
|
@ -76,6 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
|
||||||
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||||
|
int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||||
|
int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||||
|
int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||||
int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
||||||
int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
||||||
int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
||||||
|
@ -499,6 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl
|
||||||
int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
|
int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
|
||||||
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
|
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
|
||||||
|
|
||||||
|
int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
|
||||||
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
|
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
|
||||||
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
|
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
|
||||||
|
|
||||||
|
@ -527,6 +534,11 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float
|
||||||
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
|
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
|
||||||
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
|
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
|
||||||
|
int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
|
||||||
int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
@ -619,6 +631,11 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON
|
||||||
int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||||
|
|
||||||
int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
|
292
common_macro.h
292
common_macro.h
|
@ -39,6 +39,7 @@
|
||||||
#ifndef COMMON_MACRO
|
#ifndef COMMON_MACRO
|
||||||
#define COMMON_MACRO
|
#define COMMON_MACRO
|
||||||
|
|
||||||
|
#include "common_sh.h"
|
||||||
#include "common_s.h"
|
#include "common_s.h"
|
||||||
#include "common_d.h"
|
#include "common_d.h"
|
||||||
#include "common_q.h"
|
#include "common_q.h"
|
||||||
|
@ -642,6 +643,288 @@
|
||||||
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
||||||
|
|
||||||
#define GEADD_K DGEADD_K
|
#define GEADD_K DGEADD_K
|
||||||
|
|
||||||
|
#elif defined(HALF)
|
||||||
|
|
||||||
|
#define AMAX_K SAMAX_K
|
||||||
|
#define AMIN_K SAMIN_K
|
||||||
|
#define MAX_K SMAX_K
|
||||||
|
#define MIN_K SMIN_K
|
||||||
|
#define IAMAX_K ISAMAX_K
|
||||||
|
#define IAMIN_K ISAMIN_K
|
||||||
|
#define IMAX_K ISMAX_K
|
||||||
|
#define IMIN_K ISMIN_K
|
||||||
|
#define ASUM_K SASUM_K
|
||||||
|
#define DOTU_K SDOTU_K
|
||||||
|
#define DOTC_K SDOTC_K
|
||||||
|
#define AXPYU_K SAXPYU_K
|
||||||
|
#define AXPYC_K SAXPYC_K
|
||||||
|
#define AXPBY_K SAXPBY_K
|
||||||
|
#define SCAL_K SSCAL_K
|
||||||
|
#define GEMV_N SGEMV_N
|
||||||
|
#define GEMV_T SGEMV_T
|
||||||
|
#define SYMV_U SSYMV_U
|
||||||
|
#define SYMV_L SSYMV_L
|
||||||
|
#define GERU_K SGERU_K
|
||||||
|
#define GERC_K SGERC_K
|
||||||
|
#define GERV_K SGERV_K
|
||||||
|
#define GERD_K SGERD_K
|
||||||
|
#define SUM_K SSUM_K
|
||||||
|
#define SWAP_K SSWAP_K
|
||||||
|
#define ROT_K SROT_K
|
||||||
|
#define COPY_K SCOPY_K
|
||||||
|
#define NRM2_K SNRM2_K
|
||||||
|
#define SYMV_THREAD_U SSYMV_THREAD_U
|
||||||
|
#define SYMV_THREAD_L SSYMV_THREAD_L
|
||||||
|
#define GEMM_BETA SHGEMM_BETA
|
||||||
|
#define GEMM_KERNEL_N SHGEMM_KERNEL
|
||||||
|
#define GEMM_KERNEL_L SHGEMM_KERNEL
|
||||||
|
#define GEMM_KERNEL_R SHGEMM_KERNEL
|
||||||
|
#define GEMM_KERNEL_B SHGEMM_KERNEL
|
||||||
|
|
||||||
|
#define GEMM_NN SHGEMM_NN
|
||||||
|
#define GEMM_CN SHGEMM_TN
|
||||||
|
#define GEMM_TN SHGEMM_TN
|
||||||
|
#define GEMM_NC SHGEMM_NT
|
||||||
|
#define GEMM_NT SHGEMM_NT
|
||||||
|
#define GEMM_CC SHGEMM_TT
|
||||||
|
#define GEMM_CT SHGEMM_TT
|
||||||
|
#define GEMM_TC SHGEMM_TT
|
||||||
|
#define GEMM_TT SHGEMM_TT
|
||||||
|
#define GEMM_NR SHGEMM_NN
|
||||||
|
#define GEMM_TR SHGEMM_TN
|
||||||
|
#define GEMM_CR SHGEMM_TN
|
||||||
|
#define GEMM_RN SHGEMM_NN
|
||||||
|
#define GEMM_RT SHGEMM_NT
|
||||||
|
#define GEMM_RC SHGEMM_NT
|
||||||
|
#define GEMM_RR SHGEMM_NN
|
||||||
|
#define GEMM_ONCOPY SHGEMM_ONCOPY
|
||||||
|
#define GEMM_OTCOPY SHGEMM_OTCOPY
|
||||||
|
#define GEMM_INCOPY SHGEMM_INCOPY
|
||||||
|
#define GEMM_ITCOPY SHGEMM_ITCOPY
|
||||||
|
#define SYMM_THREAD_LU SSYMM_THREAD_LU
|
||||||
|
#define SYMM_THREAD_LL SSYMM_THREAD_LL
|
||||||
|
#define SYMM_THREAD_RU SSYMM_THREAD_RU
|
||||||
|
#define SYMM_THREAD_RL SSYMM_THREAD_RL
|
||||||
|
#define SYMM_LU SSYMM_LU
|
||||||
|
#define SYMM_LL SSYMM_LL
|
||||||
|
#define SYMM_RU SSYMM_RU
|
||||||
|
#define SYMM_RL SSYMM_RL
|
||||||
|
|
||||||
|
|
||||||
|
#define HEMM_THREAD_LU SHEMM_THREAD_LU
|
||||||
|
#define HEMM_THREAD_LL SHEMM_THREAD_LL
|
||||||
|
#define HEMM_THREAD_RU SHEMM_THREAD_RU
|
||||||
|
#define HEMM_THREAD_RL SHEMM_THREAD_RL
|
||||||
|
|
||||||
|
#define GEMM_THREAD_NN SHGEMM_THREAD_NN
|
||||||
|
#define GEMM_THREAD_CN SHGEMM_THREAD_TN
|
||||||
|
#define GEMM_THREAD_TN SHGEMM_THREAD_TN
|
||||||
|
#define GEMM_THREAD_NC SHGEMM_THREAD_NT
|
||||||
|
#define GEMM_THREAD_NT SHGEMM_THREAD_NT
|
||||||
|
#define GEMM_THREAD_CC SHGEMM_THREAD_TT
|
||||||
|
#define GEMM_THREAD_CT SHGEMM_THREAD_TT
|
||||||
|
#define GEMM_THREAD_TC SHGEMM_THREAD_TT
|
||||||
|
#define GEMM_THREAD_TT SHGEMM_THREAD_TT
|
||||||
|
#define GEMM_THREAD_NR SHGEMM_THREAD_NN
|
||||||
|
#define GEMM_THREAD_TR SHGEMM_THREAD_TN
|
||||||
|
#define GEMM_THREAD_CR SHGEMM_THREAD_TN
|
||||||
|
#define GEMM_THREAD_RN SHGEMM_THREAD_NN
|
||||||
|
#define GEMM_THREAD_RT SHGEMM_THREAD_NT
|
||||||
|
#define GEMM_THREAD_RC SHGEMM_THREAD_NT
|
||||||
|
#define GEMM_THREAD_RR SHGEMM_THREAD_NN
|
||||||
|
|
||||||
|
#ifdef UNIT
|
||||||
|
|
||||||
|
#define TRMM_OUNCOPY STRMM_OUNUCOPY
|
||||||
|
#define TRMM_OUTCOPY STRMM_OUTUCOPY
|
||||||
|
#define TRMM_OLNCOPY STRMM_OLNUCOPY
|
||||||
|
#define TRMM_OLTCOPY STRMM_OLTUCOPY
|
||||||
|
#define TRSM_OUNCOPY STRSM_OUNUCOPY
|
||||||
|
#define TRSM_OUTCOPY STRSM_OUTUCOPY
|
||||||
|
#define TRSM_OLNCOPY STRSM_OLNUCOPY
|
||||||
|
#define TRSM_OLTCOPY STRSM_OLTUCOPY
|
||||||
|
|
||||||
|
#define TRMM_IUNCOPY STRMM_IUNUCOPY
|
||||||
|
#define TRMM_IUTCOPY STRMM_IUTUCOPY
|
||||||
|
#define TRMM_ILNCOPY STRMM_ILNUCOPY
|
||||||
|
#define TRMM_ILTCOPY STRMM_ILTUCOPY
|
||||||
|
#define TRSM_IUNCOPY STRSM_IUNUCOPY
|
||||||
|
#define TRSM_IUTCOPY STRSM_IUTUCOPY
|
||||||
|
#define TRSM_ILNCOPY STRSM_ILNUCOPY
|
||||||
|
#define TRSM_ILTCOPY STRSM_ILTUCOPY
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define TRMM_OUNCOPY STRMM_OUNNCOPY
|
||||||
|
#define TRMM_OUTCOPY STRMM_OUTNCOPY
|
||||||
|
#define TRMM_OLNCOPY STRMM_OLNNCOPY
|
||||||
|
#define TRMM_OLTCOPY STRMM_OLTNCOPY
|
||||||
|
#define TRSM_OUNCOPY STRSM_OUNNCOPY
|
||||||
|
#define TRSM_OUTCOPY STRSM_OUTNCOPY
|
||||||
|
#define TRSM_OLNCOPY STRSM_OLNNCOPY
|
||||||
|
#define TRSM_OLTCOPY STRSM_OLTNCOPY
|
||||||
|
|
||||||
|
#define TRMM_IUNCOPY STRMM_IUNNCOPY
|
||||||
|
#define TRMM_IUTCOPY STRMM_IUTNCOPY
|
||||||
|
#define TRMM_ILNCOPY STRMM_ILNNCOPY
|
||||||
|
#define TRMM_ILTCOPY STRMM_ILTNCOPY
|
||||||
|
#define TRSM_IUNCOPY STRSM_IUNNCOPY
|
||||||
|
#define TRSM_IUTCOPY STRSM_IUTNCOPY
|
||||||
|
#define TRSM_ILNCOPY STRSM_ILNNCOPY
|
||||||
|
#define TRSM_ILTCOPY STRSM_ILTNCOPY
|
||||||
|
|
||||||
|
#define TRMM_KERNEL_LN STRMM_KERNEL_LN
|
||||||
|
#define TRMM_KERNEL_LT STRMM_KERNEL_LT
|
||||||
|
#define TRMM_KERNEL_LR STRMM_KERNEL_LN
|
||||||
|
#define TRMM_KERNEL_LC STRMM_KERNEL_LT
|
||||||
|
#define TRMM_KERNEL_RN STRMM_KERNEL_RN
|
||||||
|
#define TRMM_KERNEL_RT STRMM_KERNEL_RT
|
||||||
|
#define TRMM_KERNEL_RR STRMM_KERNEL_RN
|
||||||
|
#define TRMM_KERNEL_RC STRMM_KERNEL_RT
|
||||||
|
|
||||||
|
#define TRSM_KERNEL_LN STRSM_KERNEL_LN
|
||||||
|
#define TRSM_KERNEL_LT STRSM_KERNEL_LT
|
||||||
|
#define TRSM_KERNEL_LR STRSM_KERNEL_LN
|
||||||
|
#define TRSM_KERNEL_LC STRSM_KERNEL_LT
|
||||||
|
#define TRSM_KERNEL_RN STRSM_KERNEL_RN
|
||||||
|
#define TRSM_KERNEL_RT STRSM_KERNEL_RT
|
||||||
|
#define TRSM_KERNEL_RR STRSM_KERNEL_RN
|
||||||
|
#define TRSM_KERNEL_RC STRSM_KERNEL_RT
|
||||||
|
|
||||||
|
#define SYMM_IUTCOPY SSYMM_IUTCOPY
|
||||||
|
#define SYMM_ILTCOPY SSYMM_ILTCOPY
|
||||||
|
#define SYMM_OUTCOPY SSYMM_OUTCOPY
|
||||||
|
#define SYMM_OLTCOPY SSYMM_OLTCOPY
|
||||||
|
#define TRMM_LNUU STRMM_LNUU
|
||||||
|
#define TRMM_LNUN STRMM_LNUN
|
||||||
|
#define TRMM_LNLU STRMM_LNLU
|
||||||
|
#define TRMM_LNLN STRMM_LNLN
|
||||||
|
#define TRMM_LTUU STRMM_LTUU
|
||||||
|
#define TRMM_LTUN STRMM_LTUN
|
||||||
|
#define TRMM_LTLU STRMM_LTLU
|
||||||
|
#define TRMM_LTLN STRMM_LTLN
|
||||||
|
#define TRMM_LRUU STRMM_LNUU
|
||||||
|
#define TRMM_LRUN STRMM_LNUN
|
||||||
|
#define TRMM_LRLU STRMM_LNLU
|
||||||
|
#define TRMM_LRLN STRMM_LNLN
|
||||||
|
#define TRMM_LCUU STRMM_LTUU
|
||||||
|
#define TRMM_LCUN STRMM_LTUN
|
||||||
|
#define TRMM_LCLU STRMM_LTLU
|
||||||
|
#define TRMM_LCLN STRMM_LTLN
|
||||||
|
#define TRMM_RNUU STRMM_RNUU
|
||||||
|
#define TRMM_RNUN STRMM_RNUN
|
||||||
|
#define TRMM_RNLU STRMM_RNLU
|
||||||
|
#define TRMM_RNLN STRMM_RNLN
|
||||||
|
#define TRMM_RTUU STRMM_RTUU
|
||||||
|
#define TRMM_RTUN STRMM_RTUN
|
||||||
|
#define TRMM_RTLU STRMM_RTLU
|
||||||
|
#define TRMM_RTLN STRMM_RTLN
|
||||||
|
#define TRMM_RRUU STRMM_RNUU
|
||||||
|
#define TRMM_RRUN STRMM_RNUN
|
||||||
|
#define TRMM_RRLU STRMM_RNLU
|
||||||
|
#define TRMM_RRLN STRMM_RNLN
|
||||||
|
#define TRMM_RCUU STRMM_RTUU
|
||||||
|
#define TRMM_RCUN STRMM_RTUN
|
||||||
|
#define TRMM_RCLU STRMM_RTLU
|
||||||
|
#define TRMM_RCLN STRMM_RTLN
|
||||||
|
|
||||||
|
#define TRSM_LNUU STRSM_LNUU
|
||||||
|
#define TRSM_LNUN STRSM_LNUN
|
||||||
|
#define TRSM_LNLU STRSM_LNLU
|
||||||
|
#define TRSM_LNLN STRSM_LNLN
|
||||||
|
#define TRSM_LTUU STRSM_LTUU
|
||||||
|
#define TRSM_LTUN STRSM_LTUN
|
||||||
|
#define TRSM_LTLU STRSM_LTLU
|
||||||
|
#define TRSM_LTLN STRSM_LTLN
|
||||||
|
#define TRSM_LRUU STRSM_LNUU
|
||||||
|
#define TRSM_LRUN STRSM_LNUN
|
||||||
|
#define TRSM_LRLU STRSM_LNLU
|
||||||
|
#define TRSM_LRLN STRSM_LNLN
|
||||||
|
#define TRSM_LCUU STRSM_LTUU
|
||||||
|
#define TRSM_LCUN STRSM_LTUN
|
||||||
|
#define TRSM_LCLU STRSM_LTLU
|
||||||
|
#define TRSM_LCLN STRSM_LTLN
|
||||||
|
#define TRSM_RNUU STRSM_RNUU
|
||||||
|
#define TRSM_RNUN STRSM_RNUN
|
||||||
|
#define TRSM_RNLU STRSM_RNLU
|
||||||
|
#define TRSM_RNLN STRSM_RNLN
|
||||||
|
#define TRSM_RTUU STRSM_RTUU
|
||||||
|
#define TRSM_RTUN STRSM_RTUN
|
||||||
|
#define TRSM_RTLU STRSM_RTLU
|
||||||
|
#define TRSM_RTLN STRSM_RTLN
|
||||||
|
#define TRSM_RRUU STRSM_RNUU
|
||||||
|
#define TRSM_RRUN STRSM_RNUN
|
||||||
|
#define TRSM_RRLU STRSM_RNLU
|
||||||
|
#define TRSM_RRLN STRSM_RNLN
|
||||||
|
#define TRSM_RCUU STRSM_RTUU
|
||||||
|
#define TRSM_RCUN STRSM_RTUN
|
||||||
|
#define TRSM_RCLU STRSM_RTLU
|
||||||
|
#define TRSM_RCLN STRSM_RTLN
|
||||||
|
#define SYRK_UN SSYRK_UN
|
||||||
|
#define SYRK_UT SSYRK_UT
|
||||||
|
#define SYRK_LN SSYRK_LN
|
||||||
|
#define SYRK_LT SSYRK_LT
|
||||||
|
#define SYRK_UR SSYRK_UN
|
||||||
|
#define SYRK_UC SSYRK_UT
|
||||||
|
#define SYRK_LR SSYRK_LN
|
||||||
|
#define SYRK_LC SSYRK_LT
|
||||||
|
|
||||||
|
#define SYRK_KERNEL_U SSYRK_KERNEL_U
|
||||||
|
#define SYRK_KERNEL_L SSYRK_KERNEL_L
|
||||||
|
|
||||||
|
#define HERK_UN SSYRK_UN
|
||||||
|
#define HERK_LN SSYRK_LN
|
||||||
|
#define HERK_UC SSYRK_UT
|
||||||
|
#define HERK_LC SSYRK_LT
|
||||||
|
|
||||||
|
#define HER2K_UN SSYR2K_UN
|
||||||
|
#define HER2K_LN SSYR2K_LN
|
||||||
|
#define HER2K_UC SSYR2K_UT
|
||||||
|
#define HER2K_LC SSYR2K_LT
|
||||||
|
|
||||||
|
#define SYR2K_UN SSYR2K_UN
|
||||||
|
#define SYR2K_UT SSYR2K_UT
|
||||||
|
#define SYR2K_LN SSYR2K_LN
|
||||||
|
#define SYR2K_LT SSYR2K_LT
|
||||||
|
#define SYR2K_UR SSYR2K_UN
|
||||||
|
#define SYR2K_UC SSYR2K_UT
|
||||||
|
#define SYR2K_LR SSYR2K_LN
|
||||||
|
#define SYR2K_LC SSYR2K_LT
|
||||||
|
|
||||||
|
#define SYR2K_KERNEL_U SSYR2K_KERNEL_U
|
||||||
|
#define SYR2K_KERNEL_L SSYR2K_KERNEL_L
|
||||||
|
#define SYRK_THREAD_UN SSYRK_THREAD_UN
|
||||||
|
#define SYRK_THREAD_UT SSYRK_THREAD_UT
|
||||||
|
#define SYRK_THREAD_LN SSYRK_THREAD_LN
|
||||||
|
#define SYRK_THREAD_LT SSYRK_THREAD_LT
|
||||||
|
#define SYRK_THREAD_UR SSYRK_THREAD_UR
|
||||||
|
#define SYRK_THREAD_UC SSYRK_THREAD_UC
|
||||||
|
#define SYRK_THREAD_LR SSYRK_THREAD_LN
|
||||||
|
#define SYRK_THREAD_LC SSYRK_THREAD_LT
|
||||||
|
|
||||||
|
#define HERK_THREAD_UN SSYRK_THREAD_UN
|
||||||
|
#define HERK_THREAD_UT SSYRK_THREAD_UT
|
||||||
|
#define HERK_THREAD_LN SSYRK_THREAD_LN
|
||||||
|
#define HERK_THREAD_LT SSYRK_THREAD_LT
|
||||||
|
#define HERK_THREAD_UR SSYRK_THREAD_UR
|
||||||
|
#define HERK_THREAD_UC SSYRK_THREAD_UC
|
||||||
|
#define HERK_THREAD_LR SSYRK_THREAD_LN
|
||||||
|
#define HERK_THREAD_LC SSYRK_THREAD_LT
|
||||||
|
|
||||||
|
#define OMATCOPY_K_CN SOMATCOPY_K_CN
|
||||||
|
#define OMATCOPY_K_RN SOMATCOPY_K_RN
|
||||||
|
#define OMATCOPY_K_CT SOMATCOPY_K_CT
|
||||||
|
#define OMATCOPY_K_RT SOMATCOPY_K_RT
|
||||||
|
#define IMATCOPY_K_CN SIMATCOPY_K_CN
|
||||||
|
#define IMATCOPY_K_RN SIMATCOPY_K_RN
|
||||||
|
#define IMATCOPY_K_CT SIMATCOPY_K_CT
|
||||||
|
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||||
|
|
||||||
|
#define GEADD_K SGEADD_K
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define AMAX_K SAMAX_K
|
#define AMAX_K SAMAX_K
|
||||||
|
@ -673,14 +956,14 @@
|
||||||
#define GEMV_S SGEMV_S
|
#define GEMV_S SGEMV_S
|
||||||
#define GEMV_D SGEMV_D
|
#define GEMV_D SGEMV_D
|
||||||
|
|
||||||
|
|
||||||
|
#define SYMV_U SSYMV_U
|
||||||
|
#define SYMV_L SSYMV_L
|
||||||
#define GERU_K SGERU_K
|
#define GERU_K SGERU_K
|
||||||
#define GERC_K SGERC_K
|
#define GERC_K SGERC_K
|
||||||
#define GERV_K SGERV_K
|
#define GERV_K SGERV_K
|
||||||
#define GERD_K SGERD_K
|
#define GERD_K SGERD_K
|
||||||
|
|
||||||
#define SYMV_U SSYMV_U
|
|
||||||
#define SYMV_L SSYMV_L
|
|
||||||
|
|
||||||
#define SYMV_THREAD_U SSYMV_THREAD_U
|
#define SYMV_THREAD_U SSYMV_THREAD_U
|
||||||
#define SYMV_THREAD_L SSYMV_THREAD_L
|
#define SYMV_THREAD_L SSYMV_THREAD_L
|
||||||
|
|
||||||
|
@ -2202,6 +2485,9 @@
|
||||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||||
extern BLASLONG gemm_offset_a;
|
extern BLASLONG gemm_offset_a;
|
||||||
extern BLASLONG gemm_offset_b;
|
extern BLASLONG gemm_offset_b;
|
||||||
|
extern BLASLONG shgemm_p;
|
||||||
|
extern BLASLONG shgemm_q;
|
||||||
|
extern BLASLONG shgemm_r;
|
||||||
extern BLASLONG sgemm_p;
|
extern BLASLONG sgemm_p;
|
||||||
extern BLASLONG sgemm_q;
|
extern BLASLONG sgemm_q;
|
||||||
extern BLASLONG sgemm_r;
|
extern BLASLONG sgemm_r;
|
||||||
|
|
|
@ -43,6 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
|
|
||||||
|
#if !defined(MIPS24K)
|
||||||
static inline unsigned int rpcc(void){
|
static inline unsigned int rpcc(void){
|
||||||
unsigned long ret;
|
unsigned long ret;
|
||||||
|
|
||||||
|
@ -53,6 +54,7 @@ static inline unsigned int rpcc(void){
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#define RPCC_DEFINED
|
#define RPCC_DEFINED
|
||||||
|
#endif
|
||||||
|
|
||||||
static inline int blas_quickdivide(blasint x, blasint y){
|
static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
return x / y;
|
return x / y;
|
||||||
|
@ -92,7 +94,7 @@ REALNAME:
|
||||||
#endif
|
#endif
|
||||||
#define HUGE_PAGESIZE ( 4 << 20)
|
#define HUGE_PAGESIZE ( 4 << 20)
|
||||||
|
|
||||||
#define BUFFER_SIZE (16 << 20)
|
#define BUFFER_SIZE (16 << 21)
|
||||||
|
|
||||||
|
|
||||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||||
|
|
|
@ -227,7 +227,7 @@ REALNAME: ;\
|
||||||
|
|
||||||
#define SEEK_ADDRESS
|
#define SEEK_ADDRESS
|
||||||
|
|
||||||
#define BUFFER_SIZE ( 32 << 20)
|
#define BUFFER_SIZE ( 32 << 21)
|
||||||
|
|
||||||
#if defined(LOONGSON3A)
|
#if defined(LOONGSON3A)
|
||||||
#define PAGESIZE (16UL << 10)
|
#define PAGESIZE (16UL << 10)
|
||||||
|
|
145
common_param.h
145
common_param.h
|
@ -47,6 +47,100 @@ typedef struct {
|
||||||
int dtb_entries;
|
int dtb_entries;
|
||||||
int offsetA, offsetB, align;
|
int offsetA, offsetB, align;
|
||||||
|
|
||||||
|
#ifdef BUILD_HALF
|
||||||
|
int shgemm_p, shgemm_q, shgemm_r;
|
||||||
|
int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn;
|
||||||
|
|
||||||
|
float (*shamax_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*shamin_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*shmax_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*shmin_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG);
|
||||||
|
BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG);
|
||||||
|
BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
|
float (*shnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*shasum_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*shsum_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
|
int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||||
|
|
||||||
|
int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
|
int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
|
||||||
|
int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
|
||||||
|
int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
|
||||||
|
int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
|
int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||||
|
int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||||
|
int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||||
|
int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||||
|
|
||||||
|
int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
|
||||||
|
int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||||
|
|
||||||
|
int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||||
|
|
||||||
|
int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
|
||||||
|
int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||||
|
|
||||||
|
int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||||
|
|
||||||
|
#endif
|
||||||
int sgemm_p, sgemm_q, sgemm_r;
|
int sgemm_p, sgemm_q, sgemm_r;
|
||||||
int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn;
|
int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn;
|
||||||
|
|
||||||
|
@ -84,6 +178,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
||||||
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
|
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
|
||||||
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
|
|
||||||
int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||||
int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||||
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||||
|
@ -907,6 +1002,15 @@ extern gotoblas_t *gotoblas;
|
||||||
|
|
||||||
#define HAVE_EX_L2 gotoblas -> exclusive_cache
|
#define HAVE_EX_L2 gotoblas -> exclusive_cache
|
||||||
|
|
||||||
|
#ifdef BUILD_HALF
|
||||||
|
#define SHGEMM_P gotoblas -> shgemm_p
|
||||||
|
#define SHGEMM_Q gotoblas -> shgemm_q
|
||||||
|
#define SHGEMM_R gotoblas -> shgemm_r
|
||||||
|
#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m
|
||||||
|
#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n
|
||||||
|
#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SGEMM_P gotoblas -> sgemm_p
|
#define SGEMM_P gotoblas -> sgemm_p
|
||||||
#define SGEMM_Q gotoblas -> sgemm_q
|
#define SGEMM_Q gotoblas -> sgemm_q
|
||||||
#define SGEMM_R gotoblas -> sgemm_r
|
#define SGEMM_R gotoblas -> sgemm_r
|
||||||
|
@ -984,6 +1088,19 @@ extern gotoblas_t *gotoblas;
|
||||||
#define HAVE_EX_L2 0
|
#define HAVE_EX_L2 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* When BUILD_HALF is defined, map each SHGEMM tuning macro onto its
   SHGEMM_DEFAULT_* counterpart. */
#ifdef BUILD_HALF
|
||||||
|
#define SHGEMM_P SHGEMM_DEFAULT_P
|
||||||
|
#define SHGEMM_Q SHGEMM_DEFAULT_Q
|
||||||
|
#define SHGEMM_R SHGEMM_DEFAULT_R
|
||||||
|
#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M
|
||||||
|
#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N
|
||||||
|
/* Use an explicit SHGEMM_DEFAULT_UNROLL_MN if one is defined ... */
#ifdef SHGEMM_DEFAULT_UNROLL_MN
|
||||||
|
#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN
|
||||||
|
#else
|
||||||
|
/* ... otherwise fall back to the larger of the M and N unroll factors. */
#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N))
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SGEMM_P SGEMM_DEFAULT_P
|
#define SGEMM_P SGEMM_DEFAULT_P
|
||||||
#define SGEMM_Q SGEMM_DEFAULT_Q
|
#define SGEMM_Q SGEMM_DEFAULT_Q
|
||||||
#define SGEMM_R SGEMM_DEFAULT_R
|
#define SGEMM_R SGEMM_DEFAULT_R
|
||||||
|
@ -1119,6 +1236,18 @@ extern gotoblas_t *gotoblas;
|
||||||
#define GEMM_DEFAULT_R DGEMM_DEFAULT_R
|
#define GEMM_DEFAULT_R DGEMM_DEFAULT_R
|
||||||
#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M
|
#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M
|
||||||
#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N
|
#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N
|
||||||
|
#elif defined(HALF)
|
||||||
|
#define GEMM_P SHGEMM_P
|
||||||
|
#define GEMM_Q SHGEMM_Q
|
||||||
|
#define GEMM_R SHGEMM_R
|
||||||
|
#define GEMM_UNROLL_M SHGEMM_UNROLL_M
|
||||||
|
#define GEMM_UNROLL_N SHGEMM_UNROLL_N
|
||||||
|
#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN
|
||||||
|
#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P
|
||||||
|
#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q
|
||||||
|
#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R
|
||||||
|
#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M
|
||||||
|
#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N
|
||||||
#else
|
#else
|
||||||
#define GEMM_P SGEMM_P
|
#define GEMM_P SGEMM_P
|
||||||
#define GEMM_Q SGEMM_Q
|
#define GEMM_Q SGEMM_Q
|
||||||
|
@ -1204,28 +1333,32 @@ extern gotoblas_t *gotoblas;
|
||||||
#define GEMM_THREAD gemm_thread_n
|
#define GEMM_THREAD gemm_thread_n
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef SHGEMM_DEFAULT_R
|
||||||
|
#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef SGEMM_DEFAULT_R
|
#ifndef SGEMM_DEFAULT_R
|
||||||
#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15)
|
#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef DGEMM_DEFAULT_R
|
#ifndef DGEMM_DEFAULT_R
|
||||||
#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15)
|
#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef QGEMM_DEFAULT_R
|
#ifndef QGEMM_DEFAULT_R
|
||||||
#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15)
|
#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CGEMM_DEFAULT_R
|
#ifndef CGEMM_DEFAULT_R
|
||||||
#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15)
|
#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef ZGEMM_DEFAULT_R
|
#ifndef ZGEMM_DEFAULT_R
|
||||||
#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15)
|
#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef XGEMM_DEFAULT_R
|
#ifndef XGEMM_DEFAULT_R
|
||||||
#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15)
|
#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef SNUMOPT
|
#ifndef SNUMOPT
|
||||||
|
|
|
@ -68,7 +68,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
#define RMB __asm__ __volatile__ ("eieio":::"memory")
|
#define RMB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
|
@ -272,7 +272,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define HAVE_PREFETCH
|
#define HAVE_PREFETCH
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970)
|
||||||
#define DCBT_ARG 0
|
#define DCBT_ARG 0
|
||||||
#else
|
#else
|
||||||
#define DCBT_ARG 8
|
#define DCBT_ARG 8
|
||||||
|
@ -294,7 +294,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define L1_PREFETCH dcbtst
|
#define L1_PREFETCH dcbtst
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#define L1_DUALFETCH
|
#define L1_DUALFETCH
|
||||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||||
#define L1_PREFETCH dcbtst
|
#define L1_PREFETCH dcbtst
|
||||||
|
@ -843,7 +843,7 @@ Lmcount$lazy_ptr:
|
||||||
#define BUFFER_SIZE ( 2 << 20)
|
#define BUFFER_SIZE ( 2 << 20)
|
||||||
#elif defined(PPC440FP2)
|
#elif defined(PPC440FP2)
|
||||||
#define BUFFER_SIZE ( 16 << 20)
|
#define BUFFER_SIZE ( 16 << 20)
|
||||||
#elif defined(POWER8) || defined(POWER9)
|
#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#define BUFFER_SIZE ( 64 << 20)
|
#define BUFFER_SIZE ( 64 << 20)
|
||||||
#else
|
#else
|
||||||
#define BUFFER_SIZE ( 16 << 20)
|
#define BUFFER_SIZE ( 16 << 20)
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
#ifndef COMMON_SH_H
|
||||||
|
#define COMMON_SH_H
|
||||||
|
|
||||||
|
/* Select how the SHGEMM copy/beta/kernel macros resolve:
   - without DYNAMIC_ARCH they expand to plain shgemm_* symbol names;
   - with DYNAMIC_ARCH they expand to members read through the gotoblas pointer. */
#ifndef DYNAMIC_ARCH
|
||||||
|
|
||||||
|
#define SHGEMM_ONCOPY shgemm_oncopy
|
||||||
|
#define SHGEMM_OTCOPY shgemm_otcopy
|
||||||
|
|
||||||
|
/* If the default M and N unroll factors are equal, the "inner" copy macros
   are aliased to the "outer" copy routines instead of naming separate
   shgemm_incopy / shgemm_itcopy symbols. */
#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N
|
||||||
|
#define SHGEMM_INCOPY shgemm_oncopy
|
||||||
|
#define SHGEMM_ITCOPY shgemm_otcopy
|
||||||
|
#else
|
||||||
|
#define SHGEMM_INCOPY shgemm_incopy
|
||||||
|
#define SHGEMM_ITCOPY shgemm_itcopy
|
||||||
|
#endif
|
||||||
|
#define SHGEMM_BETA shgemm_beta
|
||||||
|
#define SHGEMM_KERNEL shgemm_kernel
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/* DYNAMIC_ARCH: every entry is looked up through gotoblas; no unroll-based
   aliasing is applied in this branch. */
#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy
|
||||||
|
#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy
|
||||||
|
#define SHGEMM_INCOPY gotoblas -> shgemm_incopy
|
||||||
|
#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy
|
||||||
|
#define SHGEMM_BETA gotoblas -> shgemm_beta
|
||||||
|
#define SHGEMM_KERNEL gotoblas -> shgemm_kernel
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define SHGEMM_NN shgemm_nn
|
||||||
|
#define SHGEMM_CN shgemm_tn
|
||||||
|
#define SHGEMM_TN shgemm_tn
|
||||||
|
#define SHGEMM_NC shgemm_nt
|
||||||
|
#define SHGEMM_NT shgemm_nt
|
||||||
|
#define SHGEMM_CC shgemm_tt
|
||||||
|
#define SHGEMM_CT shgemm_tt
|
||||||
|
#define SHGEMM_TC shgemm_tt
|
||||||
|
#define SHGEMM_TT shgemm_tt
|
||||||
|
#define SHGEMM_NR shgemm_nn
|
||||||
|
#define SHGEMM_TR shgemm_tn
|
||||||
|
#define SHGEMM_CR shgemm_tn
|
||||||
|
#define SHGEMM_RN shgemm_nn
|
||||||
|
#define SHGEMM_RT shgemm_nt
|
||||||
|
#define SHGEMM_RC shgemm_nt
|
||||||
|
#define SHGEMM_RR shgemm_nn
|
||||||
|
|
||||||
|
#define SHGEMM_THREAD_NN shgemm_thread_nn
|
||||||
|
#define SHGEMM_THREAD_CN shgemm_thread_tn
|
||||||
|
#define SHGEMM_THREAD_TN shgemm_thread_tn
|
||||||
|
#define SHGEMM_THREAD_NC shgemm_thread_nt
|
||||||
|
#define SHGEMM_THREAD_NT shgemm_thread_nt
|
||||||
|
#define SHGEMM_THREAD_CC shgemm_thread_tt
|
||||||
|
#define SHGEMM_THREAD_CT shgemm_thread_tt
|
||||||
|
#define SHGEMM_THREAD_TC shgemm_thread_tt
|
||||||
|
#define SHGEMM_THREAD_TT shgemm_thread_tt
|
||||||
|
#define SHGEMM_THREAD_NR shgemm_thread_nn
|
||||||
|
#define SHGEMM_THREAD_TR shgemm_thread_tn
|
||||||
|
#define SHGEMM_THREAD_CR shgemm_thread_tn
|
||||||
|
#define SHGEMM_THREAD_RN shgemm_thread_nn
|
||||||
|
#define SHGEMM_THREAD_RT shgemm_thread_nt
|
||||||
|
#define SHGEMM_THREAD_RC shgemm_thread_nt
|
||||||
|
#define SHGEMM_THREAD_RR shgemm_thread_nn
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
|
@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
do {
|
do {
|
||||||
while (*address) {YIELDING;};
|
while (*address) {YIELDING;}
|
||||||
|
|
||||||
#ifndef C_MSVC
|
#ifndef C_MSVC
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
|
@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){
|
||||||
#else
|
#else
|
||||||
extern unsigned int blas_quick_divide_table[];
|
extern unsigned int blas_quick_divide_table[];
|
||||||
|
|
||||||
static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
|
|
||||||
unsigned int result;
|
volatile unsigned int result;
|
||||||
|
|
||||||
if (y <= 1) return x;
|
if (y <= 1) return x;
|
||||||
|
|
||||||
|
@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
y = blas_quick_divide_table[y];
|
y = blas_quick_divide_table[y];
|
||||||
|
|
||||||
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -5,6 +5,14 @@ inline void pauser(){
|
||||||
std::getline(std::cin, dummy);
|
std::getline(std::cin, dummy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Abort the thread-safety test when the requested concurrency is zero.
//
// numConcurrentThreads: number of concurrent calls into OpenBLAS that the
//                       tester was asked to make.
//
// Returns normally for any non-zero value. For zero it prints two diagnostic
// lines to stdout and terminates the process with exit(-1).
//
// Declared inline, like pauser() next to it, so that including this file from
// more than one translation unit cannot yield duplicate definitions.
inline void FailIfThreadsAreZero(uint32_t numConcurrentThreads) {
	if(numConcurrentThreads == 0) {
		std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl;
		// More than one tester calls this helper, so the failure line does not name a single routine.
		std::cout<<"CBLAS thread safety test FAILED!"<<std::endl;
		exit(-1);
	}
}
|
||||||
|
|
||||||
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
||||||
for(uint32_t i=0; i<numMat; i++){
|
for(uint32_t i=0; i<numMat; i++){
|
||||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||||
|
|
|
@ -47,6 +47,8 @@ int main(int argc, char* argv[]){
|
||||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||||
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||||
|
|
||||||
|
FailIfThreadsAreZero(numConcurrentThreads);
|
||||||
|
|
||||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||||
std::mt19937_64 PRNG = InitPRNG();
|
std::mt19937_64 PRNG = InitPRNG();
|
||||||
std::cout<<"done\n";
|
std::cout<<"done\n";
|
||||||
|
|
|
@ -18,7 +18,7 @@ int main(int argc, char* argv[]){
|
||||||
uint32_t maxHwThreads = omp_get_max_threads();
|
uint32_t maxHwThreads = omp_get_max_threads();
|
||||||
|
|
||||||
if (maxHwThreads < 52)
|
if (maxHwThreads < 52)
|
||||||
numConcurrentThreads = maxHwThreads -4;
|
numConcurrentThreads = maxHwThreads;
|
||||||
|
|
||||||
if (argc > 4){
|
if (argc > 4){
|
||||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||||
|
@ -48,6 +48,8 @@ int main(int argc, char* argv[]){
|
||||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||||
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||||
|
|
||||||
|
FailIfThreadsAreZero(numConcurrentThreads);
|
||||||
|
|
||||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||||
std::mt19937_64 PRNG = InitPRNG();
|
std::mt19937_64 PRNG = InitPRNG();
|
||||||
std::cout<<"done\n";
|
std::cout<<"done\n";
|
||||||
|
|
22
cpuid_mips.c
22
cpuid_mips.c
|
@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CPU_UNKNOWN 0
|
#define CPU_UNKNOWN 0
|
||||||
#define CPU_P5600 1
|
#define CPU_P5600 1
|
||||||
#define CPU_1004K 2
|
#define CPU_1004K 2
|
||||||
|
#define CPU_24K 3
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"UNKNOWN",
|
"UNKNOWN",
|
||||||
"P5600",
|
"P5600",
|
||||||
"1004K"
|
"MIPS1004K",
|
||||||
|
"MIPS24K"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void){
|
int detect(void){
|
||||||
|
@ -105,6 +107,8 @@ int detect(void){
|
||||||
return CPU_P5600;
|
return CPU_P5600;
|
||||||
} else if (strstr(p, "1004K")) {
|
} else if (strstr(p, "1004K")) {
|
||||||
return CPU_1004K;
|
return CPU_1004K;
|
||||||
|
} else if (strstr(p, " 24K")) {
|
||||||
|
return CPU_24K;
|
||||||
} else
|
} else
|
||||||
return CPU_UNKNOWN;
|
return CPU_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
@ -121,7 +125,7 @@ void get_architecture(void){
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_subarchitecture(void){
|
void get_subarchitecture(void){
|
||||||
if(detect()==CPU_P5600|| detect()==CPU_1004K){
|
if(detect()==CPU_P5600|| detect()==CPU_1004K|| detect()==CPU_24K){
|
||||||
printf("P5600");
|
printf("P5600");
|
||||||
}else{
|
}else{
|
||||||
printf("UNKNOWN");
|
printf("UNKNOWN");
|
||||||
|
@ -146,7 +150,15 @@ void get_cpuconfig(void){
|
||||||
printf("#define MIPS1004K\n");
|
printf("#define MIPS1004K\n");
|
||||||
printf("#define L1_DATA_SIZE 32768\n");
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
printf("#define L1_DATA_LINESIZE 32\n");
|
printf("#define L1_DATA_LINESIZE 32\n");
|
||||||
printf("#define L2_SIZE 26144\n");
|
printf("#define L2_SIZE 262144\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
|
} else if (detect()==CPU_24K) {
|
||||||
|
printf("#define MIPS24K\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 32\n");
|
||||||
|
printf("#define L2_SIZE 32768\n");
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||||
printf("#define DTB_SIZE 4096\n");
|
printf("#define DTB_SIZE 4096\n");
|
||||||
printf("#define L2_ASSOCIATIVE 4\n");
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
|
@ -159,7 +171,9 @@ void get_libname(void){
|
||||||
if(detect()==CPU_P5600) {
|
if(detect()==CPU_P5600) {
|
||||||
printf("p5600\n");
|
printf("p5600\n");
|
||||||
} else if (detect()==CPU_1004K) {
|
} else if (detect()==CPU_1004K) {
|
||||||
printf("1004K\n");
|
printf("mips1004K\n");
|
||||||
|
} else if (detect()==CPU_24K) {
|
||||||
|
printf("mips24K\n");
|
||||||
}else{
|
}else{
|
||||||
printf("mips\n");
|
printf("mips\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,6 +57,7 @@
|
||||||
#define CPUTYPE_PPCG4 7
|
#define CPUTYPE_PPCG4 7
|
||||||
#define CPUTYPE_POWER8 8
|
#define CPUTYPE_POWER8 8
|
||||||
#define CPUTYPE_POWER9 9
|
#define CPUTYPE_POWER9 9
|
||||||
|
#define CPUTYPE_POWER10 10
|
||||||
|
|
||||||
char *cpuname[] = {
|
char *cpuname[] = {
|
||||||
"UNKNOWN",
|
"UNKNOWN",
|
||||||
|
@ -68,7 +69,8 @@ char *cpuname[] = {
|
||||||
"CELL",
|
"CELL",
|
||||||
"PPCG4",
|
"PPCG4",
|
||||||
"POWER8",
|
"POWER8",
|
||||||
"POWER9"
|
"POWER9",
|
||||||
|
"POWER10"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *lowercpuname[] = {
|
char *lowercpuname[] = {
|
||||||
|
@ -81,7 +83,8 @@ char *lowercpuname[] = {
|
||||||
"cell",
|
"cell",
|
||||||
"ppcg4",
|
"ppcg4",
|
||||||
"power8",
|
"power8",
|
||||||
"power9"
|
"power9",
|
||||||
|
"power10"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *corename[] = {
|
char *corename[] = {
|
||||||
|
@ -94,7 +97,8 @@ char *corename[] = {
|
||||||
"CELL",
|
"CELL",
|
||||||
"PPCG4",
|
"PPCG4",
|
||||||
"POWER8",
|
"POWER8",
|
||||||
"POWER9"
|
"POWER9",
|
||||||
|
"POWER10"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void){
|
int detect(void){
|
||||||
|
@ -125,6 +129,7 @@ int detect(void){
|
||||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||||
|
if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10;
|
||||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||||
|
|
||||||
|
@ -157,6 +162,7 @@ int detect(void){
|
||||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||||
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||||
|
if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10;
|
||||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||||
return CPUTYPE_POWER5;
|
return CPUTYPE_POWER5;
|
||||||
|
@ -179,6 +185,9 @@ int detect(void){
|
||||||
int id;
|
int id;
|
||||||
__asm __volatile("mfpvr %0" : "=r"(id));
|
__asm __volatile("mfpvr %0" : "=r"(id));
|
||||||
switch ( id >> 16 ) {
|
switch ( id >> 16 ) {
|
||||||
|
case 0x80: // POWER10
|
||||||
|
return CPUTYPE_POWER10;
|
||||||
|
break;
|
||||||
case 0x4e: // POWER9
|
case 0x4e: // POWER9
|
||||||
return CPUTYPE_POWER9;
|
return CPUTYPE_POWER9;
|
||||||
break;
|
break;
|
||||||
|
|
24
cpuid_x86.c
24
cpuid_x86.c
|
@ -1406,6 +1406,17 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
|
}
|
||||||
|
case 10: //family 6 exmodel 10
|
||||||
|
switch (model) {
|
||||||
|
case 5: // Comet Lake H and S
|
||||||
|
case 6: // Comet Lake U
|
||||||
|
if(support_avx2())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1955,6 +1966,19 @@ int get_coretype(void){
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 10:
|
||||||
|
switch (model) {
|
||||||
|
case 5: // Comet Lake H and S
|
||||||
|
case 6: // Comet Lake U
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CORE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CORE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
}
|
||||||
case 5:
|
case 5:
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 6:
|
case 6:
|
||||||
|
|
|
@ -12,6 +12,9 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh
|
||||||
foreach(float_type ${FLOAT_TYPES})
|
foreach(float_type ${FLOAT_TYPES})
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char_upper)
|
string(SUBSTRING ${float_type} 0 1 float_char_upper)
|
||||||
string(TOLOWER ${float_char_upper} float_char)
|
string(TOLOWER ${float_char_upper} float_char)
|
||||||
|
if (${float_char} STREQUAL "h")
|
||||||
|
continue()
|
||||||
|
endif()
|
||||||
#level1
|
#level1
|
||||||
add_executable(x${float_char}cblat1
|
add_executable(x${float_char}cblat1
|
||||||
c_${float_char}blat1.f
|
c_${float_char}blat1.f
|
||||||
|
|
|
@ -19,6 +19,10 @@ ifeq ($(ARCH), MIPS)
|
||||||
USE_GEMM3M = 1
|
USE_GEMM3M = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# The four shgemm driver objects (nn/nt/tn/tt) are appended to SHBLASOBJS
# only when BUILD_HALF is set to 1.
ifeq ($(BUILD_HALF),1)
|
||||||
|
SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
SBLASOBJS += \
|
SBLASOBJS += \
|
||||||
sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \
|
sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \
|
||||||
strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \
|
strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \
|
||||||
|
@ -203,7 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(
|
||||||
COMMONOBJS += syrk_thread.$(SUFFIX)
|
COMMONOBJS += syrk_thread.$(SUFFIX)
|
||||||
|
|
||||||
ifndef USE_SIMPLE_THREADED_LEVEL3
|
ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX)
|
||||||
|
endif
|
||||||
SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
|
SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
|
||||||
DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
|
DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
|
||||||
QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX)
|
QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX)
|
||||||
|
@ -283,6 +289,18 @@ endif
|
||||||
|
|
||||||
all ::
|
all ::
|
||||||
|
|
||||||
|
# shgemm_{nn,nt,tn,tt}.$(SUFFIX): each object is compiled from gemm.c (the
# first prerequisite, $<) with -DHALF plus one transpose selector
# (-DNN/-DNT/-DTN/-DTT). level3.c and ../../param.h are listed as
# prerequisites so that changing them triggers a rebuild. The output is
# written to $(@F), the file part of the target name.
shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||||
|
|
||||||
sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
@ -478,6 +496,17 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h
|
||||||
beta_thread.$(SUFFIX) : beta_thread.c ../../common.h
|
beta_thread.$(SUFFIX) : beta_thread.c ../../common.h
|
||||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
# shgemm_thread_{nn,nt,tn,tt}.$(SUFFIX): each object is compiled from gemm.c
# (the first prerequisite, $<) with -DTHREADED_LEVEL3 and -DHALF plus one
# transpose selector (-DNN/-DNT/-DTN/-DTT). level3_thread.c and
# ../../param.h are listed as prerequisites so that changing them triggers a
# rebuild. The output is written to $(@F), the file part of the target name.
shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||||
|
|
||||||
sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
@ -2652,6 +2681,18 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c
|
||||||
xtrsm_RCLN.$(SUFFIX) : trsm_R.c
|
xtrsm_RCLN.$(SUFFIX) : trsm_R.c
|
||||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F)
|
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||||
|
|
||||||
sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
@ -2848,6 +2889,18 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h
|
||||||
$(CC) -c $(PFLAGS) $< -o $(@F)
|
$(CC) -c $(PFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
|
||||||
|
shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||||
|
|
||||||
|
shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||||
|
|
||||||
sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -62,18 +62,18 @@
|
||||||
#ifndef ICOPY_OPERATION
|
#ifndef ICOPY_OPERATION
|
||||||
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
|
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
|
||||||
defined(RN) || defined(RT) || defined(RC) || defined(RR)
|
defined(RN) || defined(RT) || defined(RC) || defined(RR)
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#else
|
#else
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef OCOPY_OPERATION
|
#ifndef OCOPY_OPERATION
|
||||||
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
|
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
|
||||||
defined(NR) || defined(TR) || defined(CR) || defined(RR)
|
defined(NR) || defined(TR) || defined(CR) || defined(RR)
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#else
|
#else
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -173,7 +173,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){
|
XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){
|
||||||
BLASLONG k, lda, ldb, ldc;
|
BLASLONG k, lda, ldb, ldc;
|
||||||
FLOAT *alpha, *beta;
|
FLOAT *alpha, *beta;
|
||||||
FLOAT *a, *b, *c;
|
IFLOAT *a, *b;
|
||||||
|
FLOAT *c;
|
||||||
BLASLONG m_from, m_to, n_from, n_to;
|
BLASLONG m_from, m_to, n_from, n_to;
|
||||||
|
|
||||||
BLASLONG ls, is, js;
|
BLASLONG ls, is, js;
|
||||||
|
@ -198,8 +199,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
k = K;
|
k = K;
|
||||||
|
|
||||||
a = (FLOAT *)A;
|
a = (IFLOAT *)A;
|
||||||
b = (FLOAT *)B;
|
b = (IFLOAT *)B;
|
||||||
c = (FLOAT *)C;
|
c = (FLOAT *)C;
|
||||||
|
|
||||||
lda = LDA;
|
lda = LDA;
|
||||||
|
|
|
@ -117,18 +117,18 @@ typedef struct {
|
||||||
#ifndef ICOPY_OPERATION
|
#ifndef ICOPY_OPERATION
|
||||||
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
|
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
|
||||||
defined(RN) || defined(RT) || defined(RC) || defined(RR)
|
defined(RN) || defined(RT) || defined(RC) || defined(RR)
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#else
|
#else
|
||||||
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef OCOPY_OPERATION
|
#ifndef OCOPY_OPERATION
|
||||||
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
|
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
|
||||||
defined(NR) || defined(TR) || defined(CR) || defined(RR)
|
defined(NR) || defined(TR) || defined(CR) || defined(RR)
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#else
|
#else
|
||||||
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -219,15 +219,16 @@ typedef struct {
|
||||||
#define STOP_RPCC(COUNTER)
|
#define STOP_RPCC(COUNTER)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){
|
||||||
|
|
||||||
FLOAT *buffer[DIVIDE_RATE];
|
IFLOAT *buffer[DIVIDE_RATE];
|
||||||
|
|
||||||
BLASLONG k, lda, ldb, ldc;
|
BLASLONG k, lda, ldb, ldc;
|
||||||
BLASLONG m_from, m_to, n_from, n_to;
|
BLASLONG m_from, m_to, n_from, n_to;
|
||||||
|
|
||||||
FLOAT *alpha, *beta;
|
FLOAT *alpha, *beta;
|
||||||
FLOAT *a, *b, *c;
|
IFLOAT *a, *b;
|
||||||
|
FLOAT *c;
|
||||||
job_t *job = (job_t *)args -> common;
|
job_t *job = (job_t *)args -> common;
|
||||||
|
|
||||||
BLASLONG nthreads_m;
|
BLASLONG nthreads_m;
|
||||||
|
@ -255,8 +256,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
k = K;
|
k = K;
|
||||||
|
|
||||||
a = (FLOAT *)A;
|
a = (IFLOAT *)A;
|
||||||
b = (FLOAT *)B;
|
b = (IFLOAT *)B;
|
||||||
c = (FLOAT *)C;
|
c = (FLOAT *)C;
|
||||||
|
|
||||||
lda = LDA;
|
lda = LDA;
|
||||||
|
@ -425,7 +426,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Apply kernel with local region of A and part of other region of B */
|
/* Apply kernel with local region of A and part of other region of B */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
|
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
|
||||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||||
c, ldc, m_from, js);
|
c, ldc, m_from, js);
|
||||||
STOP_RPCC(kernel);
|
STOP_RPCC(kernel);
|
||||||
|
|
||||||
|
@ -469,7 +470,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Apply kernel with local region of A and part of region of B */
|
/* Apply kernel with local region of A and part of region of B */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
|
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha,
|
||||||
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
|
||||||
c, ldc, is, js);
|
c, ldc, is, js);
|
||||||
STOP_RPCC(kernel);
|
STOP_RPCC(kernel);
|
||||||
|
|
||||||
|
@ -532,7 +533,7 @@ static int round_up(int remainder, int width, int multiple)
|
||||||
|
|
||||||
|
|
||||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
*range_n, FLOAT *sa, FLOAT *sb,
|
*range_n, IFLOAT *sa, IFLOAT *sb,
|
||||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifndef USE_OPENMP
|
||||||
|
@ -728,7 +729,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){
|
||||||
|
|
||||||
BLASLONG m = args -> m;
|
BLASLONG m = args -> m;
|
||||||
BLASLONG n = args -> n;
|
BLASLONG n = args -> n;
|
||||||
|
|
|
@ -281,6 +281,8 @@ int get_node(void);
|
||||||
static int increased_threads = 0;
|
static int increased_threads = 0;
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
|
extern int openblas_get_num_threads(void);
|
||||||
|
|
||||||
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
|
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
|
||||||
const int active_threads = openblas_get_num_threads();
|
const int active_threads = openblas_get_num_threads();
|
||||||
|
|
||||||
|
@ -602,7 +604,7 @@ int blas_thread_init(void){
|
||||||
if(ret!=0){
|
if(ret!=0){
|
||||||
struct rlimit rlim;
|
struct rlimit rlim;
|
||||||
const char *msg = strerror(ret);
|
const char *msg = strerror(ret);
|
||||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
|
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg);
|
||||||
#ifdef RLIMIT_NPROC
|
#ifdef RLIMIT_NPROC
|
||||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||||
|
|
|
@ -332,7 +332,7 @@ int support_avx512(){
|
||||||
if((ebx & (1<<7)) == 0){
|
if((ebx & (1<<7)) == 0){
|
||||||
ret=0; //OS does not even support AVX2
|
ret=0; //OS does not even support AVX2
|
||||||
}
|
}
|
||||||
if((ebx & (1<<31)) != 0){
|
if((ebx & (1u<<31)) != 0){
|
||||||
xgetbv(0, &eax, &edx);
|
xgetbv(0, &eax, &edx);
|
||||||
if((eax & 0xe0) == 0xe0)
|
if((eax & 0xe0) == 0xe0)
|
||||||
ret=1; //OS supports AVX512VL
|
ret=1; //OS supports AVX512VL
|
||||||
|
@ -617,6 +617,18 @@ static gotoblas_t *get_coretype(void){
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
case 10:
|
||||||
|
if (model == 5 || model == 6) {
|
||||||
|
if(support_avx2())
|
||||||
|
return &gotoblas_HASWELL;
|
||||||
|
if(support_avx()) {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||||
|
return &gotoblas_SANDYBRIDGE;
|
||||||
|
} else {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||||
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){
|
||||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||||
if ( (eax & 0xffff) >= 0x01) {
|
if ( (eax & 0xffff) >= 0x01) {
|
||||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||||
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
|
if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -764,18 +776,53 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
||||||
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
||||||
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
||||||
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
|
if (gotoblas == &gotoblas_ATOM)
|
||||||
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[ 6];
|
||||||
|
#else
|
||||||
|
return corename[10];
|
||||||
|
#endif
|
||||||
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
||||||
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
|
if (gotoblas == &gotoblas_PENRYN)
|
||||||
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[ 8];
|
||||||
|
#else
|
||||||
|
return corename[7];
|
||||||
|
#endif
|
||||||
|
if (gotoblas == &gotoblas_DUNNINGTON)
|
||||||
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[ 9];
|
||||||
|
#else
|
||||||
|
return corename[7];
|
||||||
|
#endif
|
||||||
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
||||||
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
||||||
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
|
if (gotoblas == &gotoblas_OPTERON_SSE3)
|
||||||
if (gotoblas == &gotoblas_OPTERON) return corename[13];
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[12];
|
||||||
|
#else
|
||||||
|
return corename[7];
|
||||||
|
#endif
|
||||||
|
if (gotoblas == &gotoblas_OPTERON)
|
||||||
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[13];
|
||||||
|
#else
|
||||||
|
return corename[7];
|
||||||
|
#endif
|
||||||
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
||||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
if (gotoblas == &gotoblas_NANO)
|
||||||
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[15];
|
||||||
|
#else
|
||||||
|
return corename[10];
|
||||||
|
#endif
|
||||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
if (gotoblas == &gotoblas_BOBCAT)
|
||||||
|
#ifdef DYNAMIC_OLDER
|
||||||
|
return corename[17];
|
||||||
|
#else
|
||||||
|
return corename[7];
|
||||||
|
#endif
|
||||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||||
|
@ -787,6 +834,7 @@ char *gotoblas_corename(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static gotoblas_t *force_coretype(char *coretype){
|
static gotoblas_t *force_coretype(char *coretype){
|
||||||
|
|
||||||
int i ;
|
int i ;
|
||||||
|
|
|
@ -6,6 +6,9 @@ extern gotoblas_t gotoblas_POWER8;
|
||||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||||
extern gotoblas_t gotoblas_POWER9;
|
extern gotoblas_t gotoblas_POWER9;
|
||||||
#endif
|
#endif
|
||||||
|
#if (!defined __GNUC__) || ( __GNUC__ >= 11)
|
||||||
|
extern gotoblas_t gotoblas_POWER10;
|
||||||
|
#endif
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char *msg);
|
extern void openblas_warning(int verbose, const char *msg);
|
||||||
|
|
||||||
|
@ -13,7 +16,8 @@ static char *corename[] = {
|
||||||
"unknown",
|
"unknown",
|
||||||
"POWER6",
|
"POWER6",
|
||||||
"POWER8",
|
"POWER8",
|
||||||
"POWER9"
|
"POWER9",
|
||||||
|
"POWER10"
|
||||||
};
|
};
|
||||||
|
|
||||||
#define NUM_CORETYPES 4
|
/* Number of entries in corename[] ("unknown", POWER6, POWER8, POWER9, POWER10).
 * Keep in sync with the table; presumably bounds the name lookup in
 * force_coretype(), whose "case 4" (POWER10) needs index 4 to be reachable
 * - TODO confirm against the lookup loop. */
#define NUM_CORETYPES 5
|
||||||
|
@ -23,6 +27,9 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||||
|
#endif
|
||||||
|
#if (!defined __GNUC__) || ( __GNUC__ >= 11)
|
||||||
|
if (gotoblas == &gotoblas_POWER10) return corename[4];
|
||||||
#endif
|
#endif
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
@ -36,6 +43,10 @@ static gotoblas_t *get_coretype(void) {
|
||||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||||
if (__builtin_cpu_is("power9"))
|
if (__builtin_cpu_is("power9"))
|
||||||
return &gotoblas_POWER9;
|
return &gotoblas_POWER9;
|
||||||
|
#endif
|
||||||
|
#if (!defined __GNUC__) || ( __GNUC__ >= 11)
|
||||||
|
/* ISA 3.1 is a hardware capability, not a CPU model name, so query it with
 * __builtin_cpu_supports() (feature string "arch_3_1") alongside "mma". */
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
|
||||||
|
return &gotoblas_POWER10;
|
||||||
#endif
|
#endif
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -61,6 +72,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||||
case 2: return (&gotoblas_POWER8);
|
case 2: return (&gotoblas_POWER8);
|
||||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||||
case 3: return (&gotoblas_POWER9);
|
case 3: return (&gotoblas_POWER9);
|
||||||
|
#endif
|
||||||
|
#if (!defined __GNUC__) || ( __GNUC__ >= 11)
|
||||||
|
case 4: return (&gotoblas_POWER10);
|
||||||
#endif
|
#endif
|
||||||
default: return NULL;
|
default: return NULL;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,12 +1,58 @@
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
// Gate kernels for z13 and z14 on gcc version
|
||||||
|
#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
|
||||||
|
/* RHEL 7 since 7.3: */ \
|
||||||
|
(__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
|
||||||
|
__GNUC_RH_RELEASE__ >= 11)
|
||||||
|
#define HAVE_Z13_SUPPORT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if __GNUC__ >= 7
|
||||||
|
#define HAVE_Z14_SUPPORT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Guard the use of getauxval() on glibc version >= 2.16
|
||||||
|
#ifdef __GLIBC__
|
||||||
|
#include <features.h>
|
||||||
|
#if __GLIBC_PREREQ(2, 16)
|
||||||
|
#include <sys/auxv.h>
|
||||||
|
#define HAVE_GETAUXVAL 1
|
||||||
|
|
||||||
|
static unsigned long get_hwcap(void)
|
||||||
|
{
|
||||||
|
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||||
|
char *maskenv;
|
||||||
|
|
||||||
|
// honor requests for not using specific CPU features in LD_HWCAP_MASK
|
||||||
|
maskenv = getenv("LD_HWCAP_MASK");
|
||||||
|
if (maskenv)
|
||||||
|
hwcap &= strtoul(maskenv, NULL, 0);
|
||||||
|
|
||||||
|
return hwcap;
|
||||||
|
// note that a missing auxval is interpreted as no capabilities
|
||||||
|
// available, which is safe.
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // __GLIBC_PREREQ(2, 16)
|
||||||
|
/* "#warn" is not a preprocessor directive and would abort the build when this
 * branch is compiled; "#warning" emits the intended non-fatal diagnostic. */
#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
|
||||||
|
|
||||||
|
static unsigned long get_hwcap(void) {
|
||||||
|
// treat missing support for getauxval() as no capabilities available,
|
||||||
|
// which is safe.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif // __GLIBC_PREREQ(2, 16)
|
||||||
|
#endif // __GLIBC
|
||||||
|
|
||||||
|
extern gotoblas_t gotoblas_ZARCH_GENERIC;
|
||||||
|
#ifdef HAVE_Z13_SUPPORT
|
||||||
extern gotoblas_t gotoblas_Z13;
|
extern gotoblas_t gotoblas_Z13;
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_Z14_SUPPORT
|
||||||
extern gotoblas_t gotoblas_Z14;
|
extern gotoblas_t gotoblas_Z14;
|
||||||
//extern gotoblas_t gotoblas_Z15;
|
#endif
|
||||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
|
||||||
//extern gotoblas_t gotoblas_Z14;
|
|
||||||
//#endif
|
|
||||||
|
|
||||||
#define NUM_CORETYPES 4
|
#define NUM_CORETYPES 4
|
||||||
|
|
||||||
|
@ -16,47 +62,50 @@ static char* corename[] = {
|
||||||
"unknown",
|
"unknown",
|
||||||
"Z13",
|
"Z13",
|
||||||
"Z14",
|
"Z14",
|
||||||
// "Z15",
|
|
||||||
"ZARCH_GENERIC",
|
"ZARCH_GENERIC",
|
||||||
};
|
};
|
||||||
|
|
||||||
char* gotoblas_corename(void) {
|
char* gotoblas_corename(void) {
|
||||||
|
#ifdef HAVE_Z13_SUPPORT
|
||||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_Z14_SUPPORT
|
||||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||||
// if (gotoblas == &gotoblas_Z15) return corename[3];
|
#endif
|
||||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
|
||||||
// if (gotoblas == &gotoblas_POWER9) return corename[3];
|
|
||||||
//#endif
|
return corename[0];
|
||||||
return corename[0]; // try generic?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// __builtin_cpu_is is not supported by zarch
|
/**
|
||||||
|
* Detect the fitting set of kernels by retrieving the CPU features supported by
|
||||||
|
* OS from the auxiliary value AT_HWCAP and choosing the set of kernels
|
||||||
|
* ("coretype") that exploits most of the features and can be compiled with the
|
||||||
|
* available gcc version.
|
||||||
|
* Note that we cannot use vector registers on a z13 or newer unless supported
|
||||||
|
* by the OS kernel (which needs to handle them properly during context switch).
|
||||||
|
*/
|
||||||
static gotoblas_t* get_coretype(void) {
|
static gotoblas_t* get_coretype(void) {
|
||||||
FILE* infile;
|
|
||||||
char buffer[512], * p;
|
|
||||||
|
|
||||||
p = (char*)NULL;
|
unsigned long hwcap __attribute__((unused)) = get_hwcap();
|
||||||
infile = fopen("/proc/sysinfo", "r");
|
|
||||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
// z14 and z15 systems: exploit Vector Facility (SIMD) and
|
||||||
if (!strncmp("Type", buffer, 4)) {
|
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
|
||||||
p = strchr(buffer, ':') + 2;
|
#ifdef HAVE_Z14_SUPPORT
|
||||||
#if 0
|
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
|
||||||
fprintf(stderr, "%s\n", p);
|
return &gotoblas_Z14;
|
||||||
#endif
|
#endif
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fclose(infile);
|
// z13: Vector Facility (SIMD for double)
|
||||||
|
#ifdef HAVE_Z13_SUPPORT
|
||||||
|
if (hwcap & HWCAP_S390_VX)
|
||||||
|
return &gotoblas_Z13;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (strstr(p, "2964")) return &gotoblas_Z13;
|
// fallback in case of missing compiler support, systems before z13, or
|
||||||
if (strstr(p, "2965")) return &gotoblas_Z13;
|
// when the OS does not advertise support for the Vector Facility (e.g.,
|
||||||
if (strstr(p, "3906")) return &gotoblas_Z14;
|
// missing support in the OS kernel)
|
||||||
if (strstr(p, "3907")) return &gotoblas_Z14;
|
return &gotoblas_ZARCH_GENERIC;
|
||||||
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
|
|
||||||
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
|
|
||||||
|
|
||||||
return NULL; // should be ZARCH_GENERIC
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static gotoblas_t* force_coretype(char* coretype) {
|
static gotoblas_t* force_coretype(char* coretype) {
|
||||||
|
@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) {
|
||||||
|
|
||||||
switch (found)
|
switch (found)
|
||||||
{
|
{
|
||||||
|
#ifdef HAVE_Z13_SUPPORT
|
||||||
case 1: return (&gotoblas_Z13);
|
case 1: return (&gotoblas_Z13);
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_Z14_SUPPORT
|
||||||
case 2: return (&gotoblas_Z14);
|
case 2: return (&gotoblas_Z14);
|
||||||
// case 3: return (&gotoblas_Z15);
|
#endif
|
||||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
case 3: return (&gotoblas_ZARCH_GENERIC);
|
||||||
// case 3: return (&gotoblas_POWER9);
|
|
||||||
//#endif
|
|
||||||
default: return NULL;
|
default: return NULL;
|
||||||
}
|
}
|
||||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||||
|
@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) {
|
||||||
|
|
||||||
if (gotoblas == NULL)
|
if (gotoblas == NULL)
|
||||||
{
|
{
|
||||||
snprintf(coremsg, 128, "Falling back to Z14 core\n");
|
snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n");
|
||||||
openblas_warning(1, coremsg);
|
openblas_warning(1, coremsg);
|
||||||
gotoblas = &gotoblas_Z14;
|
gotoblas = &gotoblas_ZARCH_GENERIC;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (gotoblas && gotoblas->init) {
|
if (gotoblas && gotoblas->init) {
|
||||||
|
|
|
@ -2070,7 +2070,7 @@ if (!release->address) return;
|
||||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||||
int errsv=errno;
|
int errsv=errno;
|
||||||
perror("OpenBLAS : munmap failed:");
|
perror("OpenBLAS : munmap failed:");
|
||||||
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
|
printf("error code=%d,\trelease->address=%p\n",errsv,release->address);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -62,6 +62,11 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
||||||
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if SHGEMM_P == shgemm_p
|
||||||
|
BLASLONG shgemm_p = DEFAULT_GEMM_P;
|
||||||
|
#else
|
||||||
|
BLASLONG shgemm_p = SHGEMM_P;
|
||||||
|
#endif
|
||||||
#if SGEMM_P == sgemm_p
|
#if SGEMM_P == sgemm_p
|
||||||
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
||||||
#else
|
#else
|
||||||
|
@ -83,6 +88,11 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P;
|
||||||
BLASLONG zgemm_p = ZGEMM_P;
|
BLASLONG zgemm_p = ZGEMM_P;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if SHGEMM_Q == shgemm_q
|
||||||
|
BLASLONG shgemm_q = DEFAULT_GEMM_Q;
|
||||||
|
#else
|
||||||
|
BLASLONG shgemm_q = SHGEMM_Q;
|
||||||
|
#endif
|
||||||
#if SGEMM_Q == sgemm_q
|
#if SGEMM_Q == sgemm_q
|
||||||
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
|
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
|
||||||
#else
|
#else
|
||||||
|
@ -104,6 +114,11 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q;
|
||||||
BLASLONG zgemm_q = ZGEMM_Q;
|
BLASLONG zgemm_q = ZGEMM_Q;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if SHGEMM_R == shgemm_r
|
||||||
|
BLASLONG shgemm_r = DEFAULT_GEMM_R;
|
||||||
|
#else
|
||||||
|
BLASLONG shgemm_r = SHGEMM_R;
|
||||||
|
#endif
|
||||||
#if SGEMM_R == sgemm_r
|
#if SGEMM_R == sgemm_r
|
||||||
BLASLONG sgemm_r = DEFAULT_GEMM_R;
|
BLASLONG sgemm_r = DEFAULT_GEMM_R;
|
||||||
#else
|
#else
|
||||||
|
@ -597,6 +612,7 @@ void blas_set_parameter(void){
|
||||||
|
|
||||||
size = BITMASK(cpuid3, 16, 0xff);
|
size = BITMASK(cpuid3, 16, 0xff);
|
||||||
|
|
||||||
|
shgemm_p = 192 * (size + 1);
|
||||||
sgemm_p = 192 * (size + 1);
|
sgemm_p = 192 * (size + 1);
|
||||||
dgemm_p = 96 * (size + 1);
|
dgemm_p = 96 * (size + 1);
|
||||||
cgemm_p = 96 * (size + 1);
|
cgemm_p = 96 * (size + 1);
|
||||||
|
@ -610,6 +626,7 @@ void blas_set_parameter(void){
|
||||||
xgemm_p = 16 * (size + 1);
|
xgemm_p = 16 * (size + 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
|
||||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||||
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
||||||
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
||||||
|
|
|
@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED
|
||||||
BUILD_LAPACK_DEPRECATED = 0
|
BUILD_LAPACK_DEPRECATED = 0
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef BUILD_HALF
|
||||||
|
BUILD_HALF = 0
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
ifndef ONLY_CBLAS
|
ifndef ONLY_CBLAS
|
||||||
|
@ -151,8 +155,12 @@ ifeq ($(F_COMPILER), INTEL)
|
||||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
|
else ifeq ($(F_COMPILER), FLANG)
|
||||||
|
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
|
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||||
|
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
|
||||||
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
else
|
else
|
||||||
|
|
||||||
ifneq ($(C_COMPILER), LSB)
|
ifneq ($(C_COMPILER), LSB)
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
-Wl,--whole-archive $< -Wl,--no-whole-archive \
|
||||||
|
@ -234,23 +242,23 @@ static : ../$(LIBNAME)
|
||||||
rm -f goto.$(SUFFIX)
|
rm -f goto.$(SUFFIX)
|
||||||
|
|
||||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||||
|
|
||||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||||
|
|
||||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||||
|
|
||||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F)
|
||||||
|
|
||||||
test : linktest.c
|
test : linktest.c
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
|
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c
|
||||||
|
|
||||||
clean ::
|
clean ::
|
||||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||||
|
|
|
@ -40,17 +40,13 @@
|
||||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
||||||
xerbla,
|
xerbla,
|
||||||
saxpby,daxpby,caxpby,zaxpby,
|
saxpby,daxpby,caxpby,zaxpby,
|
||||||
|
somatcopy, domatcopy, comatcopy, zomatcopy,
|
||||||
|
simatcopy, dimatcopy, cimatcopy, zimatcopy,
|
||||||
sgeadd,dgeadd,cgeadd,zgeadd,
|
sgeadd,dgeadd,cgeadd,zgeadd,
|
||||||
somatcopy,
|
ssum, dsum, scsum, dzsum
|
||||||
simatcopy,
|
|
||||||
domatcopy,
|
|
||||||
dimatcopy,
|
|
||||||
comatcopy,
|
|
||||||
cimatcopy,
|
|
||||||
zomatcopy,
|
|
||||||
zimatcopy,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@halfblasobjs = (shgemm);
|
||||||
@cblasobjs = (
|
@cblasobjs = (
|
||||||
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
|
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
|
||||||
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
|
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
|
||||||
|
@ -80,9 +76,16 @@
|
||||||
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
|
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
|
||||||
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
|
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
|
||||||
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
|
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
|
||||||
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd
|
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd,
|
||||||
|
cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin,
|
||||||
|
cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin,
|
||||||
|
cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax,
|
||||||
|
cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum,
|
||||||
|
cblas_xerbla
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@halfcblasobjs = (cblas_shgemm);
|
||||||
|
|
||||||
@exblasobjs = (
|
@exblasobjs = (
|
||||||
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
||||||
qgemv,qger,qmax,qmin,
|
qgemv,qger,qmax,qmin,
|
||||||
|
@ -3454,6 +3457,10 @@ use File::Spec;
|
||||||
use File::Basename;
|
use File::Basename;
|
||||||
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
|
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
|
||||||
|
|
||||||
|
if ($ARGV[12] == 1) {
|
||||||
|
@blasobjs = (@blasobjs, @halfblasobjs);
|
||||||
|
@cblasobjs = (@cblasobjs, @halfcblasobjs);
|
||||||
|
}
|
||||||
if ($ARGV[8] == 1) {
|
if ($ARGV[8] == 1) {
|
||||||
#ONLY_CBLAS=1
|
#ONLY_CBLAS=1
|
||||||
@underscore_objs = (@misc_underscore_objs);
|
@underscore_objs = (@misc_underscore_objs);
|
||||||
|
@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs);
|
||||||
if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||||
if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||||
|
|
||||||
|
|
||||||
if ($ARGV[4] == 0) {
|
if ($ARGV[4] == 0) {
|
||||||
@no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs);
|
@no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs);
|
||||||
|
if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||||
|
if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||||
|
if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||||
|
if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
|
||||||
}else{
|
}else{
|
||||||
#NO_CBLAS=1
|
#NO_CBLAS=1
|
||||||
@no_underscore_objs = (@misc_no_underscore_objs);
|
@no_underscore_objs = (@misc_no_underscore_objs);
|
||||||
|
|
3
f_check
3
f_check
|
@ -334,7 +334,8 @@ if ($link ne "") {
|
||||||
&& ($flags !~ /kernel32/)
|
&& ($flags !~ /kernel32/)
|
||||||
&& ($flags !~ /advapi32/)
|
&& ($flags !~ /advapi32/)
|
||||||
&& ($flags !~ /shell32/)
|
&& ($flags !~ /shell32/)
|
||||||
&& ($flags !~ /omp/)
|
&& ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/))
|
||||||
|
&& ($flags !~ /[0-9]+/)
|
||||||
&& ($flags !~ /^\-l$/)
|
&& ($flags !~ /^\-l$/)
|
||||||
) {
|
) {
|
||||||
$linker_l .= $flags . " ";
|
$linker_l .= $flags . " ";
|
||||||
|
|
47
getarch.c
47
getarch.c
|
@ -650,6 +650,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "POWER9"
|
#define CORENAME "POWER9"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(FORCE_POWER10)
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "POWER"
|
||||||
|
#define SUBARCHITECTURE "POWER10"
|
||||||
|
#define SUBDIRNAME "power"
|
||||||
|
#define ARCHCONFIG "-DPOWER10 " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
|
||||||
|
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||||
|
#define LIBNAME "power10"
|
||||||
|
#define CORENAME "POWER10"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_PPCG4
|
#ifdef FORCE_PPCG4
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define ARCHITECTURE "POWER"
|
#define ARCHITECTURE "POWER"
|
||||||
|
@ -812,6 +825,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#else
|
#else
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_MIPS1004K
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "MIPS"
|
||||||
|
#define SUBARCHITECTURE "MIPS1004K"
|
||||||
|
#define SUBDIRNAME "mips"
|
||||||
|
#define ARCHCONFIG "-DMIPS1004K " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||||
|
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||||
|
#define LIBNAME "mips1004K"
|
||||||
|
#define CORENAME "MIPS1004K"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_MIPS24K
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "MIPS"
|
||||||
|
#define SUBARCHITECTURE "MIPS24K"
|
||||||
|
#define SUBDIRNAME "mips"
|
||||||
|
#define ARCHCONFIG "-DMIPS24K " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||||
|
"-DL2_SIZE=32768 -DL2_LINESIZE=32 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||||
|
#define LIBNAME "mips24K"
|
||||||
|
#define CORENAME "MIPS24K"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_I6500
|
#ifdef FORCE_I6500
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define ARCHITECTURE "MIPS"
|
#define ARCHITECTURE "MIPS"
|
||||||
|
@ -1334,10 +1375,12 @@ int main(int argc, char *argv[]){
|
||||||
|
|
||||||
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||||
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
||||||
#endif
|
#elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0
|
||||||
#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0
|
|
||||||
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||||
|
printf("ELF_VERSION=2\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef MAKE_NB_JOBS
|
#ifdef MAKE_NB_JOBS
|
||||||
#if MAKE_NB_JOBS > 0
|
#if MAKE_NB_JOBS > 0
|
||||||
|
|
|
@ -9,6 +9,8 @@
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
|
||||||
if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
|
if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
|
||||||
|
printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M);
|
||||||
|
printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N);
|
||||||
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
|
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
|
||||||
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
|
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
|
||||||
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
|
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
|
||||||
|
|
|
@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES})
|
||||||
GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type})
|
GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||||
|
|
||||||
if (USE_GEMM3M)
|
if (USE_GEMM3M)
|
||||||
GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type})
|
GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||||
endif()
|
endif()
|
||||||
endif ()
|
endif ()
|
||||||
if (${float_type} STREQUAL "COMPLEX")
|
if (${float_type} STREQUAL "COMPLEX")
|
||||||
|
|
|
@ -46,6 +46,9 @@ SBLAS3OBJS = \
|
||||||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
||||||
sgeadd.$(SUFFIX)
|
sgeadd.$(SUFFIX)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
SHBLAS3OBJS = shgemm.$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
DBLAS1OBJS = \
|
DBLAS1OBJS = \
|
||||||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
||||||
|
@ -277,6 +280,10 @@ CSBLAS3OBJS = \
|
||||||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
||||||
cblas_sgeadd.$(SUFFIX)
|
cblas_sgeadd.$(SUFFIX)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
CDBLAS1OBJS = \
|
CDBLAS1OBJS = \
|
||||||
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||||
|
@ -367,6 +374,7 @@ override CFLAGS += -I.
|
||||||
SBLAS1OBJS += $(CSBLAS1OBJS)
|
SBLAS1OBJS += $(CSBLAS1OBJS)
|
||||||
SBLAS2OBJS += $(CSBLAS2OBJS)
|
SBLAS2OBJS += $(CSBLAS2OBJS)
|
||||||
SBLAS3OBJS += $(CSBLAS3OBJS)
|
SBLAS3OBJS += $(CSBLAS3OBJS)
|
||||||
|
SHBLAS3OBJS += $(CSHBLAS3OBJS)
|
||||||
DBLAS1OBJS += $(CDBLAS1OBJS)
|
DBLAS1OBJS += $(CDBLAS1OBJS)
|
||||||
DBLAS2OBJS += $(CDBLAS2OBJS)
|
DBLAS2OBJS += $(CDBLAS2OBJS)
|
||||||
DBLAS3OBJS += $(CDBLAS3OBJS)
|
DBLAS3OBJS += $(CDBLAS3OBJS)
|
||||||
|
@ -380,6 +388,7 @@ ZBLAS3OBJS += $(CZBLAS3OBJS)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
|
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
|
||||||
|
SHBLASOBJS = $(SHBLAS3OBJS)
|
||||||
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
|
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
|
||||||
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
|
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
|
||||||
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
|
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
|
||||||
|
@ -454,7 +463,7 @@ ZBLASOBJS += $(ZLAPACKOBJS)
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
||||||
|
|
||||||
ifdef EXPRECISION
|
ifdef EXPRECISION
|
||||||
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
||||||
|
@ -488,10 +497,10 @@ level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $
|
||||||
level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
|
level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
|
||||||
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
|
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
|
||||||
|
|
||||||
level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS)
|
level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS)
|
||||||
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
|
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
|
||||||
|
|
||||||
$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \
|
$(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \
|
||||||
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS
|
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS
|
||||||
|
|
||||||
srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c
|
srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c
|
||||||
|
@ -1209,6 +1218,11 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c
|
||||||
xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
||||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||||
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
endif
|
||||||
|
|
||||||
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
|
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
@ -1770,6 +1784,11 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c
|
||||||
cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h
|
cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||||
|
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||||
|
endif
|
||||||
|
|
||||||
cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h
|
cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -77,7 +77,7 @@
|
||||||
#define GEMM_MULTITHREAD_THRESHOLD 4
|
#define GEMM_MULTITHREAD_THRESHOLD 4
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = {
|
||||||
#ifndef GEMM3M
|
#ifndef GEMM3M
|
||||||
GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN,
|
GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN,
|
||||||
GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT,
|
GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT,
|
||||||
|
@ -108,8 +108,8 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA
|
||||||
void NAME(char *TRANSA, char *TRANSB,
|
void NAME(char *TRANSA, char *TRANSB,
|
||||||
blasint *M, blasint *N, blasint *K,
|
blasint *M, blasint *N, blasint *K,
|
||||||
FLOAT *alpha,
|
FLOAT *alpha,
|
||||||
FLOAT *a, blasint *ldA,
|
IFLOAT *a, blasint *ldA,
|
||||||
FLOAT *b, blasint *ldB,
|
IFLOAT *b, blasint *ldB,
|
||||||
FLOAT *beta,
|
FLOAT *beta,
|
||||||
FLOAT *c, blasint *ldC){
|
FLOAT *c, blasint *ldC){
|
||||||
|
|
||||||
|
@ -119,8 +119,8 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||||
blasint info;
|
blasint info;
|
||||||
|
|
||||||
char transA, transB;
|
char transA, transB;
|
||||||
FLOAT *buffer;
|
IFLOAT *buffer;
|
||||||
FLOAT *sa, *sb;
|
IFLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
double MNK;
|
double MNK;
|
||||||
|
|
|
@ -41,6 +41,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
foreach (float_type ${FLOAT_TYPES})
|
foreach (float_type ${FLOAT_TYPES})
|
||||||
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
set (float_char "SH")
|
||||||
|
endif ()
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type})
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type})
|
||||||
if (DEFINED ${float_char}MAXKERNEL)
|
if (DEFINED ${float_char}MAXKERNEL)
|
||||||
|
@ -93,6 +96,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
|
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
|
||||||
foreach (float_type ${FLOAT_TYPES})
|
foreach (float_type ${FLOAT_TYPES})
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
set (float_char "SH")
|
||||||
|
endif ()
|
||||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})
|
||||||
|
@ -124,17 +130,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) )
|
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) )
|
||||||
set(USE_TRMM true)
|
set(USE_TRMM true)
|
||||||
endif ()
|
endif ()
|
||||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9))
|
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
|
||||||
set(USE_TRMM true)
|
set(USE_TRMM true)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
foreach (float_type SINGLE DOUBLE)
|
foreach (float_type SINGLE DOUBLE HALF)
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
if (NOT ${BUILD_HALF})
|
||||||
|
continue ()
|
||||||
|
else ()
|
||||||
|
set (float_char "SH")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
foreach (float_type ${FLOAT_TYPES})
|
foreach (float_type ${FLOAT_TYPES})
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
set (float_char "SH")
|
||||||
|
endif ()
|
||||||
if (${float_char}GEMMINCOPY)
|
if (${float_char}GEMMINCOPY)
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -470,9 +486,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
|
||||||
endforeach ()
|
endforeach ()
|
||||||
|
|
||||||
|
|
||||||
# Makefile.LA
|
# Makefile.LA
|
||||||
if(NOT NO_LAPACK)
|
if(NOT NO_LAPACK)
|
||||||
foreach (float_type ${FLOAT_TYPES})
|
foreach (float_type ${FLOAT_TYPES})
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
set (float_char "SH")
|
||||||
|
endif ()
|
||||||
if (NOT DEFINED ${float_char}NEG_TCOPY)
|
if (NOT DEFINED ${float_char}NEG_TCOPY)
|
||||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
|
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
|
||||||
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c)
|
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c)
|
||||||
|
@ -516,6 +536,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
foreach (float_type ${FLOAT_TYPES})
|
foreach (float_type ${FLOAT_TYPES})
|
||||||
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
||||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||||
|
if (${float_type} STREQUAL "HALF")
|
||||||
|
set (float_char "SH")
|
||||||
|
endif ()
|
||||||
GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type})
|
GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type})
|
||||||
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type})
|
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type})
|
||||||
endforeach ()
|
endforeach ()
|
||||||
|
|
|
@ -51,6 +51,10 @@ ifeq ($(CORE), POWER9)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), POWER10)
|
||||||
|
USE_TRMM = 1
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), zarch)
|
ifeq ($(ARCH), zarch)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
@ -59,6 +63,25 @@ ifeq ($(CORE), Z14)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF), 1)
|
||||||
|
ifndef SHGEMMKERNEL
|
||||||
|
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||||
|
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||||
|
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||||
|
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||||
|
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||||
|
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||||
|
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
|
SHKERNELOBJS += \
|
||||||
|
shgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||||
|
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
|
||||||
|
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
|
||||||
|
endif
|
||||||
|
|
||||||
SKERNELOBJS += \
|
SKERNELOBJS += \
|
||||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||||
|
@ -93,6 +116,9 @@ XKERNELOBJS += \
|
||||||
$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
|
$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
|
||||||
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
|
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
SHBLASOBJS += $(SHKERNELOBJS)
|
||||||
|
endif
|
||||||
SBLASOBJS += $(SKERNELOBJS)
|
SBLASOBJS += $(SKERNELOBJS)
|
||||||
DBLASOBJS += $(DKERNELOBJS)
|
DBLASOBJS += $(DKERNELOBJS)
|
||||||
QBLASOBJS += $(QKERNELOBJS)
|
QBLASOBJS += $(QKERNELOBJS)
|
||||||
|
@ -100,6 +126,10 @@ CBLASOBJS += $(CKERNELOBJS)
|
||||||
ZBLASOBJS += $(ZKERNELOBJS)
|
ZBLASOBJS += $(ZKERNELOBJS)
|
||||||
XBLASOBJS += $(XKERNELOBJS)
|
XBLASOBJS += $(XKERNELOBJS)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
SBLASOBJS += \
|
SBLASOBJS += \
|
||||||
sgemm_beta$(TSUFFIX).$(SUFFIX) \
|
sgemm_beta$(TSUFFIX).$(SUFFIX) \
|
||||||
strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
|
||||||
|
@ -389,6 +419,12 @@ ZBLASOBJS += \
|
||||||
zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||||
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF), 1)
|
||||||
|
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
endif
|
||||||
|
|
||||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
@ -415,6 +451,11 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
@ -433,12 +474,47 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
||||||
$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF), 1)
|
||||||
|
|
||||||
|
$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||||
|
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s
|
||||||
|
m4 shgemmotcopy.s > shgemmotcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@
|
||||||
|
rm shgemmotcopy.s shgemmotcopy_nomacros.s
|
||||||
|
else
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s
|
||||||
|
m4 shgemmitcopy.s > shgemmitcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@
|
||||||
|
rm shgemmitcopy.s shgemmitcopy_nomacros.s
|
||||||
|
else
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
|
||||||
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||||
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||||
|
@ -454,7 +530,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
||||||
|
|
||||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
|
||||||
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||||
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||||
|
@ -466,7 +542,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
|
||||||
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||||
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||||
|
@ -484,7 +560,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
||||||
|
|
||||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
|
||||||
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||||
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||||
|
@ -527,7 +603,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
||||||
|
|
||||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
|
||||||
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||||
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||||
|
@ -550,7 +626,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
||||||
|
|
||||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
|
||||||
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||||
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||||
|
@ -582,7 +658,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s
|
||||||
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
@ -590,9 +666,22 @@ else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF), 1)
|
||||||
|
|
||||||
|
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s
|
||||||
|
m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
|
rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
else
|
||||||
|
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
|
||||||
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
@ -605,7 +694,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
|
||||||
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||||
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||||
|
@ -615,7 +704,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
|
||||||
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||||
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||||
|
@ -625,7 +714,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
|
||||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||||
|
@ -635,7 +724,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
|
||||||
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||||
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||||
|
@ -645,7 +734,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
|
||||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||||
|
@ -655,7 +744,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
|
||||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||||
|
@ -665,7 +754,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
|
||||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||||
|
@ -675,7 +764,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
|
||||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||||
|
@ -699,7 +788,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
||||||
ifdef USE_TRMM
|
ifdef USE_TRMM
|
||||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s
|
||||||
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||||
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||||
|
@ -709,7 +798,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s
|
||||||
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||||
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||||
|
@ -719,7 +808,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s
|
||||||
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||||
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||||
|
@ -729,7 +818,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
@ -739,7 +828,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
|
||||||
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||||
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||||
|
@ -749,7 +838,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
|
||||||
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||||
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||||
|
@ -759,7 +848,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
|
||||||
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||||
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||||
|
@ -769,7 +858,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
|
||||||
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||||
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||||
|
@ -791,7 +880,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s
|
||||||
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||||
|
@ -801,7 +890,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
|
||||||
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||||
|
@ -811,7 +900,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
|
||||||
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||||
|
@ -821,7 +910,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
|
||||||
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||||
|
@ -831,7 +920,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
|
||||||
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||||
|
@ -841,7 +930,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
|
||||||
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||||
|
@ -851,7 +940,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
|
||||||
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||||
|
@ -861,7 +950,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
|
||||||
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||||
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||||
|
@ -871,7 +960,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
|
||||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||||
|
@ -881,7 +970,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
|
||||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||||
|
@ -891,7 +980,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
|
||||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||||
|
@ -901,7 +990,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
|
||||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||||
|
@ -911,7 +1000,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
|
||||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||||
|
@ -921,7 +1010,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
|
||||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||||
|
@ -931,7 +1020,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
|
||||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||||
|
@ -941,7 +1030,7 @@ endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
|
||||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||||
|
@ -961,7 +1050,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
@ -1095,7 +1184,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
|
||||||
|
|
||||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
|
||||||
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||||
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||||
|
@ -2206,6 +2295,11 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_
|
||||||
$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF),1)
|
||||||
|
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||||
|
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
|
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
@ -2221,6 +2315,24 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
||||||
$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF), 1)
|
||||||
|
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||||
|
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||||
|
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||||
|
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
||||||
|
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||||
|
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
|
$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
@ -2325,6 +2437,12 @@ endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
ifeq ($(BUILD_HALF), 1)
|
||||||
|
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||||
|
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
|
@ -2342,7 +2460,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
|
||||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||||
|
@ -2388,7 +2506,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
|
|
||||||
# Profiled (PSUFFIX) strmm_kernel_RT object, built from the SGEMM kernel source.
# AIX path: emit assembly to strmm_kernel_rt.s, expand it with m4 into
# strmm_kernel_rt_nomacros.s, assemble that into $@, then remove both temporaries.
# The m4 step must read the strmm_kernel_rt.s written by the step before it;
# it previously named strmmkernel_rn.s, a file that no step creates.
# NOTE(review): the first step uses $(CFLAGS) while the compile step uses $(PFLAGS);
# the cgemm_kernel_r profile rule uses $(PFLAGS) for every step - confirm which is intended.
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
ifeq ($(OS), AIX)
|
ifeq ($(OS), AIX)
|
||||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
|
||||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
|
|
@ -1,3 +1,187 @@
|
||||||
include $(KERNELDIR)/KERNEL.ARMV8
|
SAMINKERNEL = ../arm/amin.c
|
||||||
|
DAMINKERNEL = ../arm/amin.c
|
||||||
|
CAMINKERNEL = ../arm/zamin.c
|
||||||
|
ZAMINKERNEL = ../arm/zamin.c
|
||||||
|
|
||||||
|
SMAXKERNEL = ../arm/max.c
|
||||||
|
DMAXKERNEL = ../arm/max.c
|
||||||
|
|
||||||
|
SMINKERNEL = ../arm/min.c
|
||||||
|
DMINKERNEL = ../arm/min.c
|
||||||
|
|
||||||
|
ISAMINKERNEL = ../arm/iamin.c
|
||||||
|
IDAMINKERNEL = ../arm/iamin.c
|
||||||
|
ICAMINKERNEL = ../arm/izamin.c
|
||||||
|
IZAMINKERNEL = ../arm/izamin.c
|
||||||
|
|
||||||
|
ISMAXKERNEL = ../arm/imax.c
|
||||||
|
IDMAXKERNEL = ../arm/imax.c
|
||||||
|
|
||||||
|
ISMINKERNEL = ../arm/imin.c
|
||||||
|
IDMINKERNEL = ../arm/imin.c
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
SAMAXKERNEL = amax.S
|
||||||
|
DAMAXKERNEL = amax.S
|
||||||
|
CAMAXKERNEL = zamax.S
|
||||||
|
ZAMAXKERNEL = zamax.S
|
||||||
|
|
||||||
|
SAXPYKERNEL = axpy.S
|
||||||
|
DAXPYKERNEL = axpy.S
|
||||||
|
CAXPYKERNEL = zaxpy.S
|
||||||
|
ZAXPYKERNEL = zaxpy.S
|
||||||
|
|
||||||
|
SROTKERNEL = rot.S
|
||||||
|
DROTKERNEL = rot.S
|
||||||
|
CROTKERNEL = zrot.S
|
||||||
|
ZROTKERNEL = zrot.S
|
||||||
|
|
||||||
|
SSCALKERNEL = scal.S
|
||||||
|
DSCALKERNEL = scal.S
|
||||||
|
CSCALKERNEL = zscal.S
|
||||||
|
ZSCALKERNEL = zscal.S
|
||||||
|
|
||||||
|
SGEMVNKERNEL = gemv_n.S
|
||||||
|
DGEMVNKERNEL = gemv_n.S
|
||||||
|
CGEMVNKERNEL = zgemv_n.S
|
||||||
|
ZGEMVNKERNEL = zgemv_n.S
|
||||||
|
|
||||||
|
SGEMVTKERNEL = gemv_t.S
|
||||||
|
DGEMVTKERNEL = gemv_t.S
|
||||||
|
CGEMVTKERNEL = zgemv_t.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t.S
|
||||||
|
|
||||||
|
|
||||||
|
SASUMKERNEL = asum.S
|
||||||
|
DASUMKERNEL = asum.S
|
||||||
|
CASUMKERNEL = casum.S
|
||||||
|
ZASUMKERNEL = zasum.S
|
||||||
|
|
||||||
|
SCOPYKERNEL = copy.S
|
||||||
|
DCOPYKERNEL = copy.S
|
||||||
|
CCOPYKERNEL = copy.S
|
||||||
|
ZCOPYKERNEL = copy.S
|
||||||
|
|
||||||
|
SSWAPKERNEL = swap.S
|
||||||
|
DSWAPKERNEL = swap.S
|
||||||
|
CSWAPKERNEL = swap.S
|
||||||
|
ZSWAPKERNEL = swap.S
|
||||||
|
|
||||||
|
ISAMAXKERNEL = iamax.S
|
||||||
|
IDAMAXKERNEL = iamax.S
|
||||||
|
ICAMAXKERNEL = izamax.S
|
||||||
|
IZAMAXKERNEL = izamax.S
|
||||||
|
|
||||||
|
SNRM2KERNEL = nrm2.S
|
||||||
|
DNRM2KERNEL = nrm2.S
|
||||||
|
CNRM2KERNEL = znrm2.S
|
||||||
|
ZNRM2KERNEL = znrm2.S
|
||||||
|
|
||||||
|
DDOTKERNEL = dot.S
|
||||||
|
SDOTKERNEL = dot.S
|
||||||
|
CDOTKERNEL = zdot.S
|
||||||
|
ZDOTKERNEL = zdot.S
|
||||||
|
DSDOTKERNEL = dot.S
|
||||||
|
|
||||||
|
DGEMM_BETA = dgemm_beta.S
|
||||||
|
SGEMM_BETA = sgemm_beta.S
|
||||||
|
|
||||||
|
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||||
|
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||||
|
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||||
|
else
|
||||||
|
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
endif
|
||||||
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||||
|
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||||
|
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
|
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||||
|
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
|
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||||
|
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||||
|
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||||
|
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
|
@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
add X, X, #128
|
add X, X, #128
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* No need to do software prefetches if the vector fits
|
||||||
|
* into L1 cache
|
||||||
|
*/
|
||||||
|
.macro KERNEL_F16_L1CACHE
|
||||||
|
ldp q4, q5, [X]
|
||||||
|
ldp q16, q17, [Y]
|
||||||
|
|
||||||
|
ldp q6, q7, [X, #32]
|
||||||
|
ldp q18, q19, [Y, #32]
|
||||||
|
|
||||||
|
fmla v16.2d, v4.2d, v0.d[0]
|
||||||
|
fmla v17.2d, v5.2d, v0.d[0]
|
||||||
|
|
||||||
|
stp q16, q17, [Y]
|
||||||
|
|
||||||
|
ldp q20, q21, [X, #64]
|
||||||
|
ldp q24, q25, [Y, #64]
|
||||||
|
|
||||||
|
fmla v18.2d, v6.2d, v0.d[0]
|
||||||
|
fmla v19.2d, v7.2d, v0.d[0]
|
||||||
|
|
||||||
|
stp q18, q19, [Y, #32]
|
||||||
|
|
||||||
|
ldp q22, q23, [X, #96]
|
||||||
|
ldp q26, q27, [Y, #96]
|
||||||
|
|
||||||
|
fmla v24.2d, v20.2d, v0.d[0]
|
||||||
|
fmla v25.2d, v21.2d, v0.d[0]
|
||||||
|
|
||||||
|
stp q24, q25, [Y, #64]
|
||||||
|
|
||||||
|
fmla v26.2d, v22.2d, v0.d[0]
|
||||||
|
fmla v27.2d, v23.2d, v0.d[0]
|
||||||
|
|
||||||
|
stp q26, q27, [Y, #96]
|
||||||
|
|
||||||
|
add Y, Y, #128
|
||||||
|
add X, X, #128
|
||||||
|
.endm
|
||||||
|
|
||||||
.macro KERNEL_F32
|
.macro KERNEL_F32
|
||||||
KERNEL_F16
|
KERNEL_F16
|
||||||
KERNEL_F16
|
KERNEL_F16
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_F32_L1CACHE
|
||||||
|
KERNEL_F16_L1CACHE
|
||||||
|
KERNEL_F16_L1CACHE
|
||||||
|
.endm
|
||||||
|
|
||||||
.macro INIT_S
|
.macro INIT_S
|
||||||
lsl INC_X, INC_X, #3
|
lsl INC_X, INC_X, #3
|
||||||
lsl INC_Y, INC_Y, #3
|
lsl INC_Y, INC_Y, #3
|
||||||
|
@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
cmp I, xzr
|
cmp I, xzr
|
||||||
beq .Ldaxpy_kernel_F1
|
beq .Ldaxpy_kernel_F1
|
||||||
|
|
||||||
|
cmp N, #2048
|
||||||
|
ble .Ldaxpy_kernel_F32_L1CACHE
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
.Ldaxpy_kernel_F32:
|
.Ldaxpy_kernel_F32:
|
||||||
|
|
||||||
|
@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
subs I, I, #1
|
subs I, I, #1
|
||||||
bne .Ldaxpy_kernel_F32
|
bne .Ldaxpy_kernel_F32
|
||||||
|
b .Ldaxpy_kernel_F1
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Ldaxpy_kernel_F32_L1CACHE:
|
||||||
|
|
||||||
|
KERNEL_F32_L1CACHE
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Ldaxpy_kernel_F32_L1CACHE
|
||||||
|
|
||||||
.Ldaxpy_kernel_F1:
|
.Ldaxpy_kernel_F1:
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,562 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define M x0
|
||||||
|
#define N x1
|
||||||
|
#define A00 x2
|
||||||
|
#define LDA x3
|
||||||
|
#define B00 x4
|
||||||
|
|
||||||
|
#define A01 x5
|
||||||
|
#define A02 x6
|
||||||
|
#define A03 x7
|
||||||
|
#define A04 x8
|
||||||
|
#define A05 x9
|
||||||
|
#define A06 x10
|
||||||
|
#define A07 x11
|
||||||
|
#define A08 x12
|
||||||
|
|
||||||
|
#define I x13
|
||||||
|
#define J x14
|
||||||
|
#define K x15
|
||||||
|
|
||||||
|
#define TEMP1 x16
|
||||||
|
#define TEMP2 x17
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
.macro SAVE_REGS
|
||||||
|
add sp, sp, #-(11 * 16)
|
||||||
|
stp d8, d9, [sp, #(0 * 16)]
|
||||||
|
stp d10, d11, [sp, #(1 * 16)]
|
||||||
|
stp d12, d13, [sp, #(2 * 16)]
|
||||||
|
stp d14, d15, [sp, #(3 * 16)]
|
||||||
|
stp d16, d17, [sp, #(4 * 16)]
|
||||||
|
stp x18, x19, [sp, #(5 * 16)]
|
||||||
|
stp x20, x21, [sp, #(6 * 16)]
|
||||||
|
stp x22, x23, [sp, #(7 * 16)]
|
||||||
|
stp x24, x25, [sp, #(8 * 16)]
|
||||||
|
stp x26, x27, [sp, #(9 * 16)]
|
||||||
|
str x28, [sp, #(10 * 16)]
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro RESTORE_REGS
|
||||||
|
ldp d8, d9, [sp, #(0 * 16)]
|
||||||
|
ldp d10, d11, [sp, #(1 * 16)]
|
||||||
|
ldp d12, d13, [sp, #(2 * 16)]
|
||||||
|
ldp d14, d15, [sp, #(3 * 16)]
|
||||||
|
ldp d16, d17, [sp, #(4 * 16)]
|
||||||
|
ldp x18, x19, [sp, #(5 * 16)]
|
||||||
|
ldp x20, x21, [sp, #(6 * 16)]
|
||||||
|
ldp x22, x23, [sp, #(7 * 16)]
|
||||||
|
ldp x24, x25, [sp, #(8 * 16)]
|
||||||
|
ldp x26, x27, [sp, #(9 * 16)]
|
||||||
|
ldr x28, [sp, #(10 * 16)]
|
||||||
|
add sp, sp, #(11*16)
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY4x8
|
||||||
|
ldr q0, [A01], #16
|
||||||
|
ldr q1, [A02], #16
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v10.s[0], v0.s[1]
|
||||||
|
ins v12.s[0], v0.s[2]
|
||||||
|
ins v14.s[0], v0.s[3]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
ins v10.s[1], v1.s[1]
|
||||||
|
ins v12.s[1], v1.s[2]
|
||||||
|
ins v14.s[1], v1.s[3]
|
||||||
|
|
||||||
|
ldr q2, [A03], #16
|
||||||
|
ldr q3, [A04], #16
|
||||||
|
ins v8.s[2], v2.s[0]
|
||||||
|
ins v10.s[2], v2.s[1]
|
||||||
|
ins v12.s[2], v2.s[2]
|
||||||
|
ins v14.s[2], v2.s[3]
|
||||||
|
ins v8.s[3], v3.s[0]
|
||||||
|
ins v10.s[3], v3.s[1]
|
||||||
|
ins v12.s[3], v3.s[2]
|
||||||
|
ins v14.s[3], v3.s[3]
|
||||||
|
|
||||||
|
ldr q4, [A05], #16
|
||||||
|
ldr q5, [A06], #16
|
||||||
|
ins v9.s[0], v4.s[0]
|
||||||
|
ins v11.s[0], v4.s[1]
|
||||||
|
ins v13.s[0], v4.s[2]
|
||||||
|
ins v15.s[0], v4.s[3]
|
||||||
|
ins v9.s[1], v5.s[0]
|
||||||
|
ins v11.s[1], v5.s[1]
|
||||||
|
ins v13.s[1], v5.s[2]
|
||||||
|
ins v15.s[1], v5.s[3]
|
||||||
|
|
||||||
|
ldr q6, [A07], #16
|
||||||
|
ldr q7, [A08], #16
|
||||||
|
ins v9.s[2], v6.s[0]
|
||||||
|
ins v11.s[2], v6.s[1]
|
||||||
|
ins v13.s[2], v6.s[2]
|
||||||
|
ins v15.s[2], v6.s[3]
|
||||||
|
ins v9.s[3], v7.s[0]
|
||||||
|
ins v11.s[3], v7.s[1]
|
||||||
|
ins v13.s[3], v7.s[2]
|
||||||
|
ins v15.s[3], v7.s[3]
|
||||||
|
|
||||||
|
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
|
||||||
|
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY2x8
|
||||||
|
ldr d0, [A01], #8
|
||||||
|
ldr d1, [A02], #8
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v10.s[0], v0.s[1]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
ins v10.s[1], v1.s[1]
|
||||||
|
|
||||||
|
ldr d2, [A03], #8
|
||||||
|
ldr d3, [A04], #8
|
||||||
|
ins v8.s[2], v2.s[0]
|
||||||
|
ins v10.s[2], v2.s[1]
|
||||||
|
ins v8.s[3], v3.s[0]
|
||||||
|
ins v10.s[3], v3.s[1]
|
||||||
|
|
||||||
|
ldr d4, [A05], #8
|
||||||
|
ldr d5, [A06], #8
|
||||||
|
ins v9.s[0], v4.s[0]
|
||||||
|
ins v11.s[0], v4.s[1]
|
||||||
|
ins v9.s[1], v5.s[0]
|
||||||
|
ins v11.s[1], v5.s[1]
|
||||||
|
|
||||||
|
ldr d6, [A07], #8
|
||||||
|
ldr d7, [A08], #8
|
||||||
|
ins v9.s[2], v6.s[0]
|
||||||
|
ins v11.s[2], v6.s[1]
|
||||||
|
ins v9.s[3], v7.s[0]
|
||||||
|
ins v11.s[3], v7.s[1]
|
||||||
|
|
||||||
|
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY1x8
|
||||||
|
ldr s0, [A01], #4
|
||||||
|
ldr s1, [A02], #4
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
|
||||||
|
ldr s2, [A03], #4
|
||||||
|
ldr s3, [A04], #4
|
||||||
|
ins v8.s[2], v2.s[0]
|
||||||
|
ins v8.s[3], v3.s[0]
|
||||||
|
|
||||||
|
ldr s4, [A05], #4
|
||||||
|
ldr s5, [A06], #4
|
||||||
|
ins v9.s[0], v4.s[0]
|
||||||
|
ins v9.s[1], v5.s[0]
|
||||||
|
|
||||||
|
ldr s6, [A07], #4
|
||||||
|
ldr s7, [A08], #4
|
||||||
|
ins v9.s[2], v6.s[0]
|
||||||
|
ins v9.s[3], v7.s[0]
|
||||||
|
|
||||||
|
st1 {v8.4s, v9.4s}, [B00], #32
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY4x4
|
||||||
|
ldr q0, [A01], #16
|
||||||
|
ldr q1, [A02], #16
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v9.s[0], v0.s[1]
|
||||||
|
ins v10.s[0], v0.s[2]
|
||||||
|
ins v11.s[0], v0.s[3]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
ins v9.s[1], v1.s[1]
|
||||||
|
ins v10.s[1], v1.s[2]
|
||||||
|
ins v11.s[1], v1.s[3]
|
||||||
|
|
||||||
|
ldr q2, [A03], #16
|
||||||
|
ldr q3, [A04], #16
|
||||||
|
ins v8.s[2], v2.s[0]
|
||||||
|
ins v9.s[2], v2.s[1]
|
||||||
|
ins v10.s[2], v2.s[2]
|
||||||
|
ins v11.s[2], v2.s[3]
|
||||||
|
ins v8.s[3], v3.s[0]
|
||||||
|
ins v9.s[3], v3.s[1]
|
||||||
|
ins v10.s[3], v3.s[2]
|
||||||
|
ins v11.s[3], v3.s[3]
|
||||||
|
|
||||||
|
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY2x4
|
||||||
|
ldr d0, [A01], #8
|
||||||
|
ldr d1, [A02], #8
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v9.s[0], v0.s[1]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
ins v9.s[1], v1.s[1]
|
||||||
|
|
||||||
|
ldr d2, [A03], #8
|
||||||
|
ldr d3, [A04], #8
|
||||||
|
ins v8.s[2], v2.s[0]
|
||||||
|
ins v9.s[2], v2.s[1]
|
||||||
|
ins v8.s[3], v3.s[0]
|
||||||
|
ins v9.s[3], v3.s[1]
|
||||||
|
|
||||||
|
st1 {v8.4s, v9.4s}, [B00], #32
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY1x4
|
||||||
|
ldr s0, [A01], #4
|
||||||
|
ldr s1, [A02], #4
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
|
||||||
|
ldr s2, [A03], #4
|
||||||
|
ldr s3, [A04], #4
|
||||||
|
ins v8.s[2], v2.s[0]
|
||||||
|
ins v8.s[3], v3.s[0]
|
||||||
|
|
||||||
|
st1 {v8.4s}, [B00], #16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY4x2
|
||||||
|
ldr q0, [A01], #16
|
||||||
|
ldr q1, [A02], #16
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v9.s[0], v0.s[1]
|
||||||
|
ins v10.s[0], v0.s[2]
|
||||||
|
ins v11.s[0], v0.s[3]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
ins v9.s[1], v1.s[1]
|
||||||
|
ins v10.s[1], v1.s[2]
|
||||||
|
ins v11.s[1], v1.s[3]
|
||||||
|
|
||||||
|
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY2x2
|
||||||
|
ldr d0, [A01], #8
|
||||||
|
ldr d1, [A02], #8
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v9.s[0], v0.s[1]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
ins v9.s[1], v1.s[1]
|
||||||
|
|
||||||
|
st1 {v8.2s, v9.2s}, [B00], #16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY1x2
|
||||||
|
ldr s0, [A01], #4
|
||||||
|
ldr s1, [A02], #4
|
||||||
|
ins v8.s[0], v0.s[0]
|
||||||
|
ins v8.s[1], v1.s[0]
|
||||||
|
|
||||||
|
st1 {v8.2s}, [B00], #8
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro COPY1x1
|
||||||
|
ldr s0, [A01], #4
|
||||||
|
str s0, [B00], #4
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
SAVE_REGS
|
||||||
|
|
||||||
|
lsl LDA, LDA, #2 // LDA = LDA * SIZE
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L8_BEGIN:
|
||||||
|
|
||||||
|
asr J, N, #3 // J = N / 8
|
||||||
|
cmp J, #0
|
||||||
|
ble .Lsgemm_ncopy_L4_BEGIN
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_ncopy_L8_M4_BEGIN:
|
||||||
|
|
||||||
|
mov A01, A00
|
||||||
|
add A02, A01, LDA
|
||||||
|
add A03, A02, LDA
|
||||||
|
add A04, A03, LDA
|
||||||
|
add A05, A04, LDA
|
||||||
|
add A06, A05, LDA
|
||||||
|
add A07, A06, LDA
|
||||||
|
add A08, A07, LDA
|
||||||
|
add A00, A08, LDA
|
||||||
|
|
||||||
|
asr I, M, #2 // I = M / 4
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L8_M4_40
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A01
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_1:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_1
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A02
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_2:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_2
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A03
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_3:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_3
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A04
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_4:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_4
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A05
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_5:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_5
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A06
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_6:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_6
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A07
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_7:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_7
|
||||||
|
|
||||||
|
asr K, M, #4 // K = M / 16(cacheline)
|
||||||
|
mov TEMP1, A08
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_tcopy_L8_warnup_8:
|
||||||
|
|
||||||
|
ldr s0, [TEMP1], #64
|
||||||
|
|
||||||
|
subs K, K, #1
|
||||||
|
bgt .Lsgemm_tcopy_L8_warnup_8
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_ncopy_L8_M4_20:
|
||||||
|
|
||||||
|
COPY4x8
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsgemm_ncopy_L8_M4_20
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L8_M4_40:
|
||||||
|
|
||||||
|
and I, M, #2
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L8_M4_60
|
||||||
|
|
||||||
|
COPY2x8
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L8_M4_60:
|
||||||
|
|
||||||
|
and I, M, #1
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L8_M4_END
|
||||||
|
|
||||||
|
COPY1x8
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L8_M4_END:
|
||||||
|
|
||||||
|
subs J , J, #1 // j--
|
||||||
|
bne .Lsgemm_ncopy_L8_M4_BEGIN
|
||||||
|
|
||||||
|
/*********************************************************************************************/
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L4_BEGIN:
|
||||||
|
|
||||||
|
tst N, #7
|
||||||
|
ble .Lsgemm_ncopy_L999
|
||||||
|
|
||||||
|
tst N, #4
|
||||||
|
ble .Lsgemm_ncopy_L2_BEGIN
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L4_M4_BEGIN:
|
||||||
|
mov A01, A00
|
||||||
|
add A02, A01, LDA
|
||||||
|
add A03, A02, LDA
|
||||||
|
add A04, A03, LDA
|
||||||
|
add A00, A04, LDA
|
||||||
|
|
||||||
|
asr I, M, #2 // I = M / 4
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L4_M4_40
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_ncopy_L4_M4_20:
|
||||||
|
|
||||||
|
COPY4x4
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsgemm_ncopy_L4_M4_20
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L4_M4_40:
|
||||||
|
|
||||||
|
and I, M, #2
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L4_M4_60
|
||||||
|
|
||||||
|
COPY2x4
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L4_M4_60:
|
||||||
|
|
||||||
|
and I, M, #1
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L4_M4_END
|
||||||
|
|
||||||
|
COPY1x4
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L4_M4_END:
|
||||||
|
|
||||||
|
|
||||||
|
/*********************************************************************************************/
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L2_BEGIN:
|
||||||
|
|
||||||
|
tst N, #2
|
||||||
|
ble .Lsgemm_ncopy_L1_BEGIN
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L2_M4_BEGIN:
|
||||||
|
|
||||||
|
mov A01, A00
|
||||||
|
add A02, A01, LDA
|
||||||
|
add A00, A02, LDA
|
||||||
|
|
||||||
|
asr I, M, #2 // I = M / 4
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L2_M4_40
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_ncopy_L2_M4_20:
|
||||||
|
|
||||||
|
COPY4x2
|
||||||
|
|
||||||
|
subs I , I , #1
|
||||||
|
bne .Lsgemm_ncopy_L2_M4_20
|
||||||
|
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L2_M4_40:
|
||||||
|
|
||||||
|
and I, M, #2
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L2_M4_60
|
||||||
|
|
||||||
|
COPY2x2
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L2_M4_60:
|
||||||
|
|
||||||
|
and I, M, #1
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L2_M4_END
|
||||||
|
|
||||||
|
COPY1x2
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L2_M4_END:
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L1_BEGIN:
|
||||||
|
|
||||||
|
tst N, #1
|
||||||
|
ble .Lsgemm_ncopy_L999
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L1_M1_BEGIN:
|
||||||
|
|
||||||
|
mov A01, A00
|
||||||
|
|
||||||
|
mov I, M
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lsgemm_ncopy_L1_M1_END
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lsgemm_ncopy_L1_M1_20:
|
||||||
|
|
||||||
|
COPY1x1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsgemm_ncopy_L1_M1_20
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L1_M1_END:
|
||||||
|
|
||||||
|
.Lsgemm_ncopy_L999:
|
||||||
|
|
||||||
|
mov x0, #0
|
||||||
|
RESTORE_REGS
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,707 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Register aliases used by the macros and the routine body below.
 *
 * x0..x4 hold the incoming values M, N, A, LDA and B.  LDA is rescaled
 * to bytes at the top of the routine, and A and B are advanced as the
 * copy proceeds.
 */
#define	M	x0
#define	N	x1
#define	A	x2
#define	LDA	x3
#define	B	x4

/* Byte step applied to B00 after each 8-wide copy (set to M * 8 * SIZE). */
#define M8	x5

/* Source cursors, one per source line handled in the current pass. */
#define	A01	x6
#define	A02	x7
#define	A03	x8
#define	A04	x9
#define	A05	x10
#define	A06	x11
#define	A07	x12
#define	A08	x13

/*
 * Destination cursors: B00 for the 8-wide copies, B01/B02/B03 for the
 * 4-, 2- and 1-wide remainder copies.  B04 is not referenced by the
 * code in this file.
 */
#define B01	x14
#define B02	x15
#define B03	x16
#define B04	x17
#define B00	x22

/*
 * Loop counters.  I used to live in x18, which AAPCS64 designates as the
 * platform register; some ABIs reserve it, so it must not be used as a
 * general-purpose temporary.  x21 is otherwise unused here and is already
 * saved and restored by SAVE_REGS / RESTORE_REGS.
 */
#define I	x21
#define	J	x19

/* Scratch register. */
#define TEMP1	x20

/* Prefetch distance in bytes ahead of each source cursor. */
#define A_PREFETCH	256
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
// Reserve an 11 * 16 byte frame and spill d8-d17 and x18-x28 into it.
// Slot layout (16-byte units): 0-4 hold d8..d17, 5-9 hold x18..x27,
// slot 10 holds x28.  RESTORE_REGS relies on the same layout.
.macro SAVE_REGS
	sub	sp, sp, #(11 * 16)

	// general-purpose registers
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]

	// floating-point registers (low 64 bits)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
.endm
|
||||||
|
|
||||||
|
// Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS and
// release the 11 * 16 byte frame.
.macro RESTORE_REGS
	// general-purpose registers
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]

	// floating-point registers (low 64 bits)
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]

	add	sp, sp, #(11 * 16)
.endm
|
||||||
|
|
||||||
|
/*************************************************************************************************************************/
|
||||||
|
|
||||||
|
// Copy 8 consecutive 32-bit elements from each of A01..A08 into 256
// contiguous bytes starting at B00.  Every source cursor advances by
// 32 bytes and B00 advances by M8.  TEMP1 is used as the running store
// address and ends at the old B00 + 256.
.macro COPY8x8
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	// sources 1-2 -> bytes 0..63
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	TEMP1, B00, #64

	// sources 3-4 -> bytes 64..127
	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	// sources 5-6 -> bytes 128..191
	ldp	q8, q9, [A05], #32
	ldp	q10, q11, [A06], #32
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

	// sources 7-8 -> bytes 192..255
	ldp	q12, q13, [A07], #32
	ldp	q14, q15, [A08], #32
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1], #64

	add	B00, B00, M8
.endm
|
||||||
|
|
||||||
|
// Copy 4 consecutive 32-bit elements from each of A01..A08 into 128
// contiguous bytes at B01.  Every source cursor advances by 16 bytes
// and B01 by 128.  No prefetch is issued on this path.
.macro COPY4x8
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

	ldr	q4, [A05], #16
	ldr	q5, [A06], #16
	ldr	q6, [A07], #16
	ldr	q7, [A08], #16
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64
.endm
|
||||||
|
|
||||||
|
// Copy 2 consecutive 32-bit elements (one 64-bit load) from each of
// A01..A08 into 64 contiguous bytes at B02.  Every source cursor
// advances by 8 bytes and B02 by 64.  No prefetch is issued here.
.macro COPY2x8
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	stp	d0, d1, [B02], #16
	stp	d2, d3, [B02], #16

	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8
	stp	d4, d5, [B02], #16
	stp	d6, d7, [B02], #16
.endm
|
||||||
|
|
||||||
|
// Copy one 32-bit element from each of A01..A08 into 32 contiguous
// bytes at B03.  Every source cursor advances by 4 bytes and B03 by 32.
//
// The previous version reloaded s4..s7 with 8-byte "ldr dN, [A0x], #8"
// after the 4-byte loads.  That read 4 bytes beyond the element being
// copied, stepped A05..A08 by 8 instead of 4, and only produced the
// right value in the low word on little-endian targets.  The second
// half now mirrors the first half exactly.
.macro COPY1x8
	// sources 1-4
	ldr	s0, [A01]
	ldr	s1, [A02]
	ldr	s2, [A03]
	ldr	s3, [A04]

	add	A01, A01, #4
	add	A02, A02, #4
	add	A03, A03, #4
	add	A04, A04, #4

	stp	s0, s1, [B03]
	add	B03, B03, #8
	stp	s2, s3, [B03]
	add	B03, B03, #8

	// sources 5-8
	ldr	s4, [A05]
	ldr	s5, [A06]
	ldr	s6, [A07]
	ldr	s7, [A08]

	add	A05, A05, #4
	add	A06, A06, #4
	add	A07, A07, #4
	add	A08, A08, #4

	stp	s4, s5, [B03]
	add	B03, B03, #8
	stp	s6, s7, [B03]
	add	B03, B03, #8
.endm
|
||||||
|
|
||||||
|
/*************************************************************************************************************************/
|
||||||
|
|
||||||
|
// Copy 8 consecutive 32-bit elements from each of A01..A04 into 128
// contiguous bytes starting at B00.  Every source cursor advances by
// 32 bytes and B00 advances by M8.  TEMP1 ends at the old B00 + 128.
.macro COPY8x4
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	// sources 1-2 -> bytes 0..63
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	TEMP1, B00, #64

	// sources 3-4 -> bytes 64..127
	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	add	B00, B00, M8
.endm
|
||||||
|
|
||||||
|
// Copy 4 consecutive 32-bit elements from each of A01..A04 into 64
// contiguous bytes at B01.  Every source cursor advances by 16 bytes
// and B01 by 64.  No prefetch is issued on this path.
.macro COPY4x4
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16

	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64
.endm
|
||||||
|
|
||||||
|
// Copy 2 consecutive 32-bit elements (one 64-bit load) from each of
// A01..A04 into 32 contiguous bytes at B02.  Every source cursor
// advances by 8 bytes and B02 by 32.  No prefetch is issued here.
.macro COPY2x4
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8

	stp	d0, d1, [B02], #16
	stp	d2, d3, [B02], #16
.endm
|
||||||
|
|
||||||
|
// Copy one 32-bit element from each of A01..A04 into 16 contiguous
// bytes at B03.  Every source cursor advances by 4 bytes and B03 by 16.
// No prefetch is issued on this path.
.macro COPY1x4
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4

	stp	s0, s1, [B03], #8
	stp	s2, s3, [B03], #8
.endm
|
||||||
|
|
||||||
|
/*************************************************************************************************************************/
|
||||||
|
|
||||||
|
// Copy 8 consecutive 32-bit elements from each of A01 and A02 into 64
// contiguous bytes at B00.  Both source cursors advance by 32 bytes and
// B00 advances by M8.
.macro COPY8x2
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ld1	{v0.4s, v1.4s}, [A01], #32
	ld1	{v2.4s, v3.4s}, [A02], #32

	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	B00, B00, M8
.endm
|
||||||
|
|
||||||
|
// Copy 4 consecutive 32-bit elements from each of A01 and A02 into 32
// contiguous bytes at B01.  Both source cursors advance by 16 bytes and
// B01 by 32.  No prefetch is issued on this path.
.macro COPY4x2
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16

	stp	q0, q1, [B01], #32
.endm
|
||||||
|
|
||||||
|
// Copy 2 consecutive 32-bit elements (one 64-bit load) from each of A01
// and A02 into 16 contiguous bytes at B02.  Both source cursors advance
// by 8 bytes and B02 by 16.  No prefetch is issued on this path.
.macro COPY2x2
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8

	stp	d0, d1, [B02], #16
.endm
|
||||||
|
|
||||||
|
// Copy one 32-bit element from each of A01 and A02 into 8 contiguous
// bytes at B03.  Both source cursors advance by 4 bytes and B03 by 8.
// No prefetch is issued on this path.
.macro COPY1x2
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4

	stp	s0, s1, [B03], #8
.endm
|
||||||
|
|
||||||
|
/*************************************************************************************************************************/
|
||||||
|
|
||||||
|
// Copy 8 consecutive 32-bit elements from A01 to B00.  A01 advances by
// 32 bytes and B00 advances by M8.
.macro COPY8x1
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	stp	q0, q1, [B00]

	add	B00, B00, M8
.endm
|
||||||
|
|
||||||
|
// Copy 4 consecutive 32-bit elements from A01 to B01; both cursors
// advance by 16 bytes.  No prefetch is issued on this path.
.macro COPY4x1
	ldr	q0, [A01], #16
	str	q0, [B01], #16
.endm
|
||||||
|
|
||||||
|
// Copy 2 consecutive 32-bit elements (one 64-bit load) from A01 to B02;
// both cursors advance by 8 bytes.  No prefetch is issued on this path.
.macro COPY2x1
	ldr	d0, [A01], #8
	str	d0, [B02], #8
.endm
|
||||||
|
|
||||||
|
// Copy a single 32-bit element from A01 to B03; both cursors advance by
// 4 bytes.  No prefetch is issued on this path.
.macro COPY1x1
	ldr	s0, [A01], #4
	str	s0, [B03], #4
.endm
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
/*
 * Routine body.
 *
 * Inputs (see the register aliases): M, N, A, LDA, B.  Source lines are
 * LDA elements apart; M of them are consumed, in passes of 8, then 4,
 * then 2, then 1.  Within a pass, N elements per line are consumed as
 * N / 8 blocks of 8 followed by optional blocks of 4, 2 and 1 selected
 * by the low bits of N.
 *
 * Destination layout:
 *   B00 -> 8-wide blocks; starts at B for each pass and steps by M8.
 *   B01 -> 4-wide blocks, starting at B + (N & -8) * M * SIZE.
 *   B02 -> 2-wide blocks, starting at B + (N & -4) * M * SIZE.
 *   B03 -> 1-wide blocks, starting at B + (N & -2) * M * SIZE.
 *
 * Always returns 0 in x0.
 *
 * The single-bit tests use tst + beq.  tst leaves N and V clear for
 * these small positive masks, so beq takes exactly the same branches as
 * the former ble while stating the intent (bit not set) directly.
 */
	PROLOGUE

	.align 5

	SAVE_REGS

	lsl	LDA, LDA, #2				// LDA = LDA * SIZE

	lsl	TEMP1, M, #2				// TEMP1 = M * SIZE

	and	B01 , N , #-8				// N rounded down to a multiple of 8
	and	B02 , N , #-4				// N rounded down to a multiple of 4
	and	B03 , N , #-2				// N rounded down to a multiple of 2

	mul	B01, B01, TEMP1
	mul	B02, B02, TEMP1
	mul	B03, B03, TEMP1

	add	B01 , B01, B				// start of the 4-wide area
	add	B02 , B02, B				// start of the 2-wide area
	add	B03 , B03, B				// start of the 1-wide area

	lsl	M8, M, #5				// M8 = M * 8 * SIZE

/*********************************************************************************************/
/* Passes over 8 source lines at a time                                                      */

.Lsgemm_tcopy_L8_BEGIN:

	asr	J, M, #3				// J = M / 8
	cmp	J, #0
	ble	.Lsgemm_tcopy_L4_BEGIN

	.align	5
.Lsgemm_tcopy_L8_M8_BEGIN:

	mov	A01, A
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A05, A04, LDA
	add	A06, A05, LDA
	add	A07, A06, LDA
	add	A08, A07, LDA
	add	A, A08, LDA				// A now points at the next group of lines

	mov	B00, B
	add	B, B00, #256				// B = B + 8 * 8 * SIZE

	asr	I, N, #3				// I = N / 8
	cmp	I, #0
	ble	.Lsgemm_tcopy_L8_M8_40

	.align	5
.Lsgemm_tcopy_L8_M8_20:

	COPY8x8

	subs	I , I , #1
	bne	.Lsgemm_tcopy_L8_M8_20

.Lsgemm_tcopy_L8_M8_40:

	tst	N , #4
	beq	.Lsgemm_tcopy_L8_M8_60

	COPY4x8

.Lsgemm_tcopy_L8_M8_60:

	tst	N , #2
	beq	.Lsgemm_tcopy_L8_M8_80

	COPY2x8

.Lsgemm_tcopy_L8_M8_80:

	tst	N, #1
	beq	.Lsgemm_tcopy_L8_M8_END

	COPY1x8

.Lsgemm_tcopy_L8_M8_END:

	subs	J, J, #1				// j--
	bne	.Lsgemm_tcopy_L8_M8_BEGIN

/*********************************************************************************************/
/* Optional pass over 4 source lines                                                         */

.Lsgemm_tcopy_L4_BEGIN:

	tst	M, #7					// nothing left once M is a multiple of 8
	beq	.Lsgemm_tcopy_L999

	tst	M, #4
	beq	.Lsgemm_tcopy_L2_BEGIN

.Lsgemm_tcopy_L4_M8_BEGIN:

	mov	A01, A
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A, A04, LDA

	mov	B00, B
	add	B, B00, #128				// B = B + 4 * 8 * SIZE

	asr	I, N, #3				// I = N / 8
	cmp	I, #0
	ble	.Lsgemm_tcopy_L4_M8_40

	.align	5
.Lsgemm_tcopy_L4_M8_20:

	COPY8x4

	subs	I , I , #1
	bne	.Lsgemm_tcopy_L4_M8_20

.Lsgemm_tcopy_L4_M8_40:

	tst	N , #4
	beq	.Lsgemm_tcopy_L4_M8_60

	COPY4x4

.Lsgemm_tcopy_L4_M8_60:

	tst	N , #2
	beq	.Lsgemm_tcopy_L4_M8_80

	COPY2x4

.Lsgemm_tcopy_L4_M8_80:

	tst	N , #1
	beq	.Lsgemm_tcopy_L4_M8_END

	COPY1x4

.Lsgemm_tcopy_L4_M8_END:

/*********************************************************************************************/
/* Optional pass over 2 source lines                                                         */

.Lsgemm_tcopy_L2_BEGIN:

	tst	M, #3					// nothing left once M is a multiple of 4
	beq	.Lsgemm_tcopy_L999

	tst	M, #2
	beq	.Lsgemm_tcopy_L1_BEGIN

.Lsgemm_tcopy_L2_M8_BEGIN:

	mov	A01, A
	add	A02, A01, LDA
	add	A, A02, LDA

	mov	B00, B
	add	B, B00, #64				// B = B + 2 * 8 * SIZE

	asr	I, N, #3				// I = N / 8
	cmp	I, #0
	ble	.Lsgemm_tcopy_L2_M8_40

	.align	5
.Lsgemm_tcopy_L2_M8_20:

	COPY8x2

	subs	I , I , #1
	bne	.Lsgemm_tcopy_L2_M8_20

.Lsgemm_tcopy_L2_M8_40:

	tst	N , #4
	beq	.Lsgemm_tcopy_L2_M8_60

	COPY4x2

.Lsgemm_tcopy_L2_M8_60:

	tst	N , #2
	beq	.Lsgemm_tcopy_L2_M8_80

	COPY2x2

.Lsgemm_tcopy_L2_M8_80:

	tst	N , #1
	beq	.Lsgemm_tcopy_L2_M8_END

	COPY1x2

.Lsgemm_tcopy_L2_M8_END:

/*********************************************************************************************/
/* Optional pass over the last single source line                                            */

.Lsgemm_tcopy_L1_BEGIN:

	tst	M, #1
	beq	.Lsgemm_tcopy_L999

.Lsgemm_tcopy_L1_M8_BEGIN:

	mov	A01, A					// A01 = A
	mov	B00, B

	asr	I, N, #3				// I = N / 8
	cmp	I, #0
	ble	.Lsgemm_tcopy_L1_M8_40

	.align	5
.Lsgemm_tcopy_L1_M8_20:

	COPY8x1

	subs	I , I , #1
	bne	.Lsgemm_tcopy_L1_M8_20

.Lsgemm_tcopy_L1_M8_40:

	tst	N , #4
	beq	.Lsgemm_tcopy_L1_M8_60

	COPY4x1

.Lsgemm_tcopy_L1_M8_60:

	tst	N , #2
	beq	.Lsgemm_tcopy_L1_M8_80

	COPY2x1

.Lsgemm_tcopy_L1_M8_80:

	tst	N , #1
	beq	.Lsgemm_tcopy_L1_M8_END

	COPY1x1

.Lsgemm_tcopy_L1_M8_END:

.Lsgemm_tcopy_L999:

	mov	x0, #0					// set return value
	RESTORE_REGS
	ret

	EPILOGUE
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -39,7 +39,7 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
||||||
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
|
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
|
||||||
FLOAT *c, BLASLONG ldc){
|
FLOAT *c, BLASLONG ldc){
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -39,24 +39,24 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||||
FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
||||||
FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
||||||
|
|
||||||
FLOAT *boffset;
|
IFLOAT *boffset;
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
|
@ -39,10 +39,10 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
FLOAT *a_offset, *a_offset1, *a_offset2;
|
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||||
FLOAT *b_offset;
|
IFLOAT *b_offset;
|
||||||
|
|
||||||
a_offset = a;
|
a_offset = a;
|
||||||
b_offset = b;
|
b_offset = b;
|
||||||
|
|
|
@ -39,30 +39,30 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||||
|
|
||||||
FLOAT *boffset;
|
IFLOAT *boffset;
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||||
|
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
|
|
|
@ -39,22 +39,22 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
|
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2;
|
IFLOAT *aoffset1, *aoffset2;
|
||||||
FLOAT *boffset;
|
IFLOAT *boffset;
|
||||||
|
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
|
@ -39,11 +39,11 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *a_offset, *a_offset1, *a_offset2;
|
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||||
FLOAT *b_offset, *b_offset1, *b_offset2;
|
IFLOAT *b_offset, *b_offset1, *b_offset2;
|
||||||
|
|
||||||
a_offset = a;
|
a_offset = a;
|
||||||
b_offset = b;
|
b_offset = b;
|
||||||
|
|
|
@ -39,32 +39,32 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
|
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||||
|
|
||||||
FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||||
|
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
|
@ -1,13 +1,32 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
|
#if defined(HALF) && defined(HALFCONVERSION)
/*
 * Widen a bfloat16 bit pattern to float.
 *
 * The 16 input bits become the most significant 16 bits of the result
 * and the low 16 bits are zero, which is what the previous
 * pointer-based version produced on both byte orders.
 *
 * Done with a shift through a union rather than by writing through an
 * unsigned short pointer into a float: that avoids the strict-aliasing
 * violation and removes the dependence on __BYTE_ORDER__ (when that
 * macro is undefined, "#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__"
 * compares 0 with 0 and selects the big-endian path).
 *
 * Assumes unsigned int and float are both 32 bits wide and share the
 * same byte order; the old code already assumed a float is exactly two
 * unsigned shorts.
 */
static float
bfloat16tof32 (bfloat16 f16)
{
  union {
    float f;
    unsigned int bits;
  } cvt;

  /* Go through unsigned short first so only the 16 payload bits are kept. */
  cvt.bits = (unsigned int)(unsigned short)f16 << 16;
  return cvt.f;
}
#define BF16TOF32(x) (bfloat16tof32(x))
#else
/* No conversion requested: use the operand as is. */
#define BF16TOF32(x) (x)
#endif
|
||||||
|
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
,BLASLONG offset
|
,BLASLONG offset
|
||||||
#endif
|
#endif
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
BLASLONG i,j,k;
|
BLASLONG i,j,k;
|
||||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
FLOAT *C0,*C1;
|
||||||
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
|
IFLOAT *ptrba,*ptrbb;
|
||||||
|
FLOAT res0,res1,res2,res3;
|
||||||
|
IFLOAT load0,load1,load2,load3,load4,load5,load6,load7;
|
||||||
for (j=0; j<bn/2; j+=1)
|
for (j=0; j<bn/2; j+=1)
|
||||||
{
|
{
|
||||||
C0 = C;
|
C0 = C;
|
||||||
|
@ -24,36 +43,36 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||||
{
|
{
|
||||||
load0 = ptrba[2*0+0];
|
load0 = ptrba[2*0+0];
|
||||||
load1 = ptrbb[2*0+0];
|
load1 = ptrbb[2*0+0];
|
||||||
res0 = res0+load0*load1;
|
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||||
load2 = ptrba[2*0+1];
|
load2 = ptrba[2*0+1];
|
||||||
res1 = res1+load2*load1;
|
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||||
load3 = ptrbb[2*0+1];
|
load3 = ptrbb[2*0+1];
|
||||||
res2 = res2+load0*load3;
|
res2 = res2+BF16TOF32(load0)*BF16TOF32(load3);
|
||||||
res3 = res3+load2*load3;
|
res3 = res3+BF16TOF32(load2)*BF16TOF32(load3);
|
||||||
load4 = ptrba[2*1+0];
|
load4 = ptrba[2*1+0];
|
||||||
load5 = ptrbb[2*1+0];
|
load5 = ptrbb[2*1+0];
|
||||||
res0 = res0+load4*load5;
|
res0 = res0+BF16TOF32(load4)*BF16TOF32(load5);
|
||||||
load6 = ptrba[2*1+1];
|
load6 = ptrba[2*1+1];
|
||||||
res1 = res1+load6*load5;
|
res1 = res1+BF16TOF32(load6)*BF16TOF32(load5);
|
||||||
load7 = ptrbb[2*1+1];
|
load7 = ptrbb[2*1+1];
|
||||||
res2 = res2+load4*load7;
|
res2 = res2+BF16TOF32(load4)*BF16TOF32(load7);
|
||||||
res3 = res3+load6*load7;
|
res3 = res3+BF16TOF32(load6)*BF16TOF32(load7);
|
||||||
load0 = ptrba[2*2+0];
|
load0 = ptrba[2*2+0];
|
||||||
load1 = ptrbb[2*2+0];
|
load1 = ptrbb[2*2+0];
|
||||||
res0 = res0+load0*load1;
|
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||||
load2 = ptrba[2*2+1];
|
load2 = ptrba[2*2+1];
|
||||||
res1 = res1+load2*load1;
|
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||||
load3 = ptrbb[2*2+1];
|
load3 = ptrbb[2*2+1];
|
||||||
res2 = res2+load0*load3;
|
res2 = res2+BF16TOF32(load0)*BF16TOF32(load3);
|
||||||
res3 = res3+load2*load3;
|
res3 = res3+BF16TOF32(load2)*BF16TOF32(load3);
|
||||||
load4 = ptrba[2*3+0];
|
load4 = ptrba[2*3+0];
|
||||||
load5 = ptrbb[2*3+0];
|
load5 = ptrbb[2*3+0];
|
||||||
res0 = res0+load4*load5;
|
res0 = res0+BF16TOF32(load4)*BF16TOF32(load5);
|
||||||
load6 = ptrba[2*3+1];
|
load6 = ptrba[2*3+1];
|
||||||
res1 = res1+load6*load5;
|
res1 = res1+BF16TOF32(load6)*BF16TOF32(load5);
|
||||||
load7 = ptrbb[2*3+1];
|
load7 = ptrbb[2*3+1];
|
||||||
res2 = res2+load4*load7;
|
res2 = res2+BF16TOF32(load4)*BF16TOF32(load7);
|
||||||
res3 = res3+load6*load7;
|
res3 = res3+BF16TOF32(load6)*BF16TOF32(load7);
|
||||||
ptrba = ptrba+8;
|
ptrba = ptrba+8;
|
||||||
ptrbb = ptrbb+8;
|
ptrbb = ptrbb+8;
|
||||||
}
|
}
|
||||||
|
@ -61,12 +80,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||||
{
|
{
|
||||||
load0 = ptrba[2*0+0];
|
load0 = ptrba[2*0+0];
|
||||||
load1 = ptrbb[2*0+0];
|
load1 = ptrbb[2*0+0];
|
||||||
res0 = res0+load0*load1;
|
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||||
load2 = ptrba[2*0+1];
|
load2 = ptrba[2*0+1];
|
||||||
res1 = res1+load2*load1;
|
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||||
load3 = ptrbb[2*0+1];
|
load3 = ptrbb[2*0+1];
|
||||||
res2 = res2+load0*load3;
|
res2 = res2+BF16TOF32(load0)*BF16TOF32(load3);
|
||||||
res3 = res3+load2*load3;
|
res3 = res3+BF16TOF32(load2)*BF16TOF32(load3);
|
||||||
ptrba = ptrba+2;
|
ptrba = ptrba+2;
|
||||||
ptrbb = ptrbb+2;
|
ptrbb = ptrbb+2;
|
||||||
}
|
}
|
||||||
|
@ -90,9 +109,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||||
{
|
{
|
||||||
load0 = ptrba[0+0];
|
load0 = ptrba[0+0];
|
||||||
load1 = ptrbb[2*0+0];
|
load1 = ptrbb[2*0+0];
|
||||||
res0 = res0+load0*load1;
|
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||||
load2 = ptrbb[2*0+1];
|
load2 = ptrbb[2*0+1];
|
||||||
res1 = res1+load0*load2;
|
res1 = res1+BF16TOF32(load0)*BF16TOF32(load2);
|
||||||
ptrba = ptrba+1;
|
ptrba = ptrba+1;
|
||||||
ptrbb = ptrbb+2;
|
ptrbb = ptrbb+2;
|
||||||
}
|
}
|
||||||
|
@ -121,9 +140,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||||
{
|
{
|
||||||
load0 = ptrba[2*0+0];
|
load0 = ptrba[2*0+0];
|
||||||
load1 = ptrbb[0+0];
|
load1 = ptrbb[0+0];
|
||||||
res0 = res0+load0*load1;
|
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||||
load2 = ptrba[2*0+1];
|
load2 = ptrba[2*0+1];
|
||||||
res1 = res1+load2*load1;
|
res1 = res1+BF16TOF32(load2)*BF16TOF32(load1);
|
||||||
ptrba = ptrba+2;
|
ptrba = ptrba+2;
|
||||||
ptrbb = ptrbb+1;
|
ptrbb = ptrbb+1;
|
||||||
}
|
}
|
||||||
|
@ -141,7 +160,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
|
||||||
{
|
{
|
||||||
load0 = ptrba[0+0];
|
load0 = ptrba[0+0];
|
||||||
load1 = ptrbb[0+0];
|
load1 = ptrbb[0+0];
|
||||||
res0 = res0+load0*load1;
|
res0 = res0+BF16TOF32(load0)*BF16TOF32(load1);
|
||||||
ptrba = ptrba+1;
|
ptrba = ptrba+1;
|
||||||
ptrbb = ptrbb+1;
|
ptrbb = ptrbb+1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
include $(KERNELDIR)/KERNEL.P5600
|
|
@ -0,0 +1,225 @@
|
||||||
|
# When __BYTE_ORDER__ expands to __ORDER_BIG_ENDIAN__ the kernel selection is
# taken wholesale from KERNEL.POWER8; in every other case the assignments that
# follow the `else` (up to the matching `endif`) are used instead.
# NOTE(review): __BYTE_ORDER__ is expected to be provided as a make variable
# by the including build files -- confirm where it is set.
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||||
|
include $(KERNELDIR)/KERNEL.POWER8
|
||||||
|
else
|
||||||
|
|
||||||
|
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||||
|
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||||
|
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
|
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
|
|
||||||
|
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||||
|
SHGEMMKERNEL = shgemm_kernel_power10.c
|
||||||
|
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
STRMMKERNEL = sgemm_kernel_power10.c
|
||||||
|
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||||
|
CTRMMKERNEL = cgemm_kernel_power10.S
|
||||||
|
ZTRMMKERNEL = zgemm_kernel_power10.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
DGEMMKERNEL = dgemm_kernel_power10.c
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||||
|
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMMKERNEL = cgemm_kernel_power10.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_power10.S
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
|
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
#TODO: CGEMM3MKERNEL should use 4x4 block sizes.
|
||||||
|
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||||
|
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||||
|
|
||||||
|
#Pure C for other kernels
|
||||||
|
#SAMAXKERNEL = ../arm/amax.c
|
||||||
|
#DAMAXKERNEL = ../arm/amax.c
|
||||||
|
#CAMAXKERNEL = ../arm/zamax.c
|
||||||
|
#ZAMAXKERNEL = ../arm/zamax.c
|
||||||
|
#
|
||||||
|
#SAMINKERNEL = ../arm/amin.c
|
||||||
|
#DAMINKERNEL = ../arm/amin.c
|
||||||
|
#CAMINKERNEL = ../arm/zamin.c
|
||||||
|
#ZAMINKERNEL = ../arm/zamin.c
|
||||||
|
#
|
||||||
|
#SMAXKERNEL = ../arm/max.c
|
||||||
|
#DMAXKERNEL = ../arm/max.c
|
||||||
|
#
|
||||||
|
#SMINKERNEL = ../arm/min.c
|
||||||
|
#DMINKERNEL = ../arm/min.c
|
||||||
|
#
|
||||||
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
|
ISAMAXKERNEL = isamax_power9.S
|
||||||
|
else
|
||||||
|
ISAMAXKERNEL = isamax.c
|
||||||
|
endif
|
||||||
|
IDAMAXKERNEL = idamax.c
|
||||||
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
|
ICAMAXKERNEL = icamax_power9.S
|
||||||
|
else
|
||||||
|
ICAMAXKERNEL = icamax.c
|
||||||
|
endif
|
||||||
|
IZAMAXKERNEL = izamax.c
|
||||||
|
#
|
||||||
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
|
ISAMINKERNEL = isamin_power9.S
|
||||||
|
else
|
||||||
|
ISAMINKERNEL = isamin.c
|
||||||
|
endif
|
||||||
|
IDAMINKERNEL = idamin.c
|
||||||
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
|
ICAMINKERNEL = icamin_power9.S
|
||||||
|
else
|
||||||
|
ICAMINKERNEL = icamin.c
|
||||||
|
endif
|
||||||
|
IZAMINKERNEL = izamin.c
|
||||||
|
#
|
||||||
|
#ISMAXKERNEL = ../arm/imax.c
|
||||||
|
#IDMAXKERNEL = ../arm/imax.c
|
||||||
|
#
|
||||||
|
#ISMINKERNEL = ../arm/imin.c
|
||||||
|
#IDMINKERNEL = ../arm/imin.c
|
||||||
|
#
|
||||||
|
SASUMKERNEL = sasum.c
|
||||||
|
DASUMKERNEL = dasum.c
|
||||||
|
CASUMKERNEL = casum.c
|
||||||
|
ZASUMKERNEL = zasum.c
|
||||||
|
#
|
||||||
|
SAXPYKERNEL = saxpy.c
|
||||||
|
DAXPYKERNEL = daxpy.c
|
||||||
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
|
CAXPYKERNEL = caxpy_power9.S
|
||||||
|
else
|
||||||
|
CAXPYKERNEL = caxpy.c
|
||||||
|
endif
|
||||||
|
ZAXPYKERNEL = zaxpy.c
|
||||||
|
#
|
||||||
|
SCOPYKERNEL = scopy.c
|
||||||
|
DCOPYKERNEL = dcopy.c
|
||||||
|
CCOPYKERNEL = ccopy.c
|
||||||
|
ZCOPYKERNEL = zcopy.c
|
||||||
|
#
|
||||||
|
SDOTKERNEL = sdot.c
|
||||||
|
DDOTKERNEL = ddot.c
|
||||||
|
DSDOTKERNEL = sdot.c
|
||||||
|
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||||
|
CDOTKERNEL = cdot_power9.S
|
||||||
|
else
|
||||||
|
CDOTKERNEL = cdot.c
|
||||||
|
endif
|
||||||
|
ZDOTKERNEL = zdot.c
|
||||||
|
#
|
||||||
|
SNRM2KERNEL = ../arm/nrm2.c
|
||||||
|
DNRM2KERNEL = ../arm/nrm2.c
|
||||||
|
CNRM2KERNEL = ../arm/znrm2.c
|
||||||
|
ZNRM2KERNEL = ../arm/znrm2.c
|
||||||
|
#
|
||||||
|
SROTKERNEL = srot.c
|
||||||
|
DROTKERNEL = drot.c
|
||||||
|
CROTKERNEL = crot.c
|
||||||
|
ZROTKERNEL = zrot.c
|
||||||
|
#
|
||||||
|
SSCALKERNEL = sscal.c
|
||||||
|
DSCALKERNEL = dscal.c
|
||||||
|
CSCALKERNEL = zscal.c
|
||||||
|
ZSCALKERNEL = zscal.c
|
||||||
|
#
|
||||||
|
SSWAPKERNEL = sswap.c
|
||||||
|
DSWAPKERNEL = dswap.c
|
||||||
|
CSWAPKERNEL = cswap.c
|
||||||
|
ZSWAPKERNEL = zswap.c
|
||||||
|
#
|
||||||
|
|
||||||
|
SGEMVNKERNEL = sgemv_n.c
|
||||||
|
DGEMVNKERNEL = dgemv_n.c
|
||||||
|
CGEMVNKERNEL = cgemv_n.c
|
||||||
|
ZGEMVNKERNEL = zgemv_n_4.c
|
||||||
|
#
|
||||||
|
SGEMVTKERNEL = sgemv_t.c
|
||||||
|
DGEMVTKERNEL = dgemv_t.c
|
||||||
|
CGEMVTKERNEL = cgemv_t.c
|
||||||
|
ZGEMVTKERNEL = zgemv_t_4.c
|
||||||
|
|
||||||
|
|
||||||
|
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||||
|
#SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||||
|
#DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||||
|
#DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||||
|
#QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||||
|
#QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||||
|
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||||
|
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||||
|
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||||
|
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||||
|
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||||
|
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||||
|
|
||||||
|
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||||
|
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||||
|
|
||||||
|
LSAME_KERNEL = ../generic/lsame.c
|
||||||
|
SCABS_KERNEL = ../generic/cabs.c
|
||||||
|
DCABS_KERNEL = ../generic/cabs.c
|
||||||
|
QCABS_KERNEL = ../generic/cabs.c
|
||||||
|
|
||||||
|
#Dump kernel
|
||||||
|
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||||
|
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||||
|
|
||||||
|
endif
|
|
@ -232,3 +232,11 @@ QCABS_KERNEL = ../generic/cabs.c
|
||||||
#Dump kernel
|
#Dump kernel
|
||||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||||
|
|
||||||
|
# The two variables are concatenated, so this branch is taken only when
# __BYTE_ORDER__ is __ORDER_BIG_ENDIAN__ and ELF_VERSION is 2.  In that case
# the IDAMAX/IDAMIN/IZAMAX/IZAMIN kernels are re-pointed at the C sources
# under ../arm; being assigned last, these values replace earlier ones.
ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
|
||||||
|
IDAMAXKERNEL = ../arm/iamax.c
|
||||||
|
IDAMINKERNEL = ../arm/iamin.c
|
||||||
|
IZAMAXKERNEL = ../arm/izamax.c
|
||||||
|
IZAMINKERNEL = ../arm/izamin.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S
|
||||||
|
|
||||||
SDOTKERNEL = dot_ppc440.S
|
SDOTKERNEL = dot_ppc440.S
|
||||||
DDOTKERNEL = dot_ppc440.S
|
DDOTKERNEL = dot_ppc440.S
|
||||||
CDOTKERNEL = zdot_ppc440.S
|
#CDOTKERNEL = zdot_ppc440.S
|
||||||
ZDOTKERNEL = zdot_ppc440.S
|
#ZDOTKERNEL = zdot_ppc440.S
|
||||||
|
CDOTKERNEL = ../arm/zdot.c
|
||||||
|
ZDOTKERNEL = ../arm/zdot.c
|
||||||
|
|
||||||
ISAMAXKERNEL = iamax_ppc440.S
|
ISAMAXKERNEL = iamax_ppc440.S
|
||||||
IDAMAXKERNEL = iamax_ppc440.S
|
IDAMAXKERNEL = iamax_ppc440.S
|
||||||
|
@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
||||||
|
|
||||||
SROTKERNEL = rot_ppc440.S
|
SROTKERNEL = rot_ppc440.S
|
||||||
DROTKERNEL = rot_ppc440.S
|
DROTKERNEL = rot_ppc440.S
|
||||||
CROTKERNEL = zrot_ppc440.S
|
#CROTKERNEL = zrot_ppc440.S
|
||||||
ZROTKERNEL = zrot_ppc440.S
|
#ZROTKERNEL = zrot_ppc440.S
|
||||||
|
CROTKERNEL = ../arm/zrot.c
|
||||||
|
ZROTKERNEL = ../arm/zrot.c
|
||||||
|
|
||||||
|
|
||||||
SSCALKERNEL = scal_ppc440.S
|
SSCALKERNEL = scal_ppc440.S
|
||||||
DSCALKERNEL = scal_ppc440.S
|
DSCALKERNEL = scal_ppc440.S
|
||||||
|
@ -78,13 +83,18 @@ DGEMMINCOPYOBJ =
|
||||||
DGEMMITCOPYOBJ =
|
DGEMMITCOPYOBJ =
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMKERNEL = zgemm_kernel_altivec_g4.S
|
#CGEMMKERNEL = zgemm_kernel_altivec_g4.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
#CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
#CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
|
CGEMMKERNEL = zgemm_kernel.S
|
||||||
|
CGEMMINCOPY =
|
||||||
|
# Clear the inner transposed-copy source, mirroring the emptied CGEMMINCOPY
# and the emptied CGEMMITCOPYOBJ.  (This line previously cleared CGEMMONCOPY,
# which is assigned its real value by the very next assignment, so the clear
# had no effect and CGEMMITCOPY was left untouched.)
CGEMMITCOPY =
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
CGEMMINCOPYOBJ =
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
#cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ =
|
||||||
|
#cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMKERNEL = zgemm_kernel_g4.S
|
ZGEMMKERNEL = zgemm_kernel_g4.S
|
||||||
|
|
|
@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "casum_microk_power8.c"
|
#include "casum_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "ccopy_microk_power8.c"
|
#include "ccopy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -424,7 +424,7 @@ L999:
|
||||||
lwz r16, 204(SP)
|
lwz r16, 204(SP)
|
||||||
lwz r15, 208(SP)
|
lwz r15, 208(SP)
|
||||||
lwz r14, 212(SP)
|
lwz r14, 212(SP)
|
||||||
addi r11, 224
|
addi r11, SP, 224
|
||||||
#endif
|
#endif
|
||||||
lvx v20, r11, r0
|
lvx v20, r11, r0
|
||||||
addi r11, r11, 16
|
addi r11, r11, 16
|
||||||
|
@ -459,4 +459,4 @@ L999:
|
||||||
blr
|
blr
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
#endif^
|
#endif
|
||||||
|
|
|
@ -0,0 +1,286 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2020, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD ld
|
||||||
|
#define STACKSIZE (512 )
|
||||||
|
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||||
|
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define K r5
|
||||||
|
|
||||||
|
|
||||||
|
#define A r8
|
||||||
|
#define B r9
|
||||||
|
#define C r10
|
||||||
|
#define LDC r6
|
||||||
|
#define OFFSET r7
|
||||||
|
|
||||||
|
|
||||||
|
#define alpha_r vs51
|
||||||
|
#define alpha_i vs55
|
||||||
|
#define save_permute_1 vs59
|
||||||
|
#define permute_mask vs63
|
||||||
|
#define o0 0
|
||||||
|
|
||||||
|
|
||||||
|
#define T1 r11
|
||||||
|
#define T2 r12
|
||||||
|
#define T3 r14
|
||||||
|
#define T4 r15
|
||||||
|
#define T5 r16
|
||||||
|
#define T6 r17
|
||||||
|
#define L r18
|
||||||
|
#define T7 r19
|
||||||
|
#define T8 r20
|
||||||
|
#define TEMP_REG r21
|
||||||
|
#define I r22
|
||||||
|
#define J r23
|
||||||
|
#define AO r24
|
||||||
|
#define BO r25
|
||||||
|
#define CO r26
|
||||||
|
#define T9 r27
|
||||||
|
#define T10 r28
|
||||||
|
#define PRE r29
|
||||||
|
|
||||||
|
#define T12 r30
|
||||||
|
#define T13 r31
|
||||||
|
|
||||||
|
#include "cgemm_macros_power10.S"
|
||||||
|
|
||||||
|
.equ perm_const1, 0x0405060700010203
|
||||||
|
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||||
|
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||||
|
.equ save_permute_11, 0x0405060714151617
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef NEEDPARAM
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
|
||||||
|
addi SP, SP, -STACKSIZE
|
||||||
|
mflr r0
|
||||||
|
|
||||||
|
|
||||||
|
stfd f14, 0(SP)
|
||||||
|
stfd f15, 8(SP)
|
||||||
|
stfd f16, 16(SP)
|
||||||
|
stfd f17, 24(SP)
|
||||||
|
|
||||||
|
stfd f18, 32(SP)
|
||||||
|
stfd f19, 40(SP)
|
||||||
|
stfd f20, 48(SP)
|
||||||
|
stfd f21, 56(SP)
|
||||||
|
|
||||||
|
stfd f22, 64(SP)
|
||||||
|
stfd f23, 72(SP)
|
||||||
|
stfd f24, 80(SP)
|
||||||
|
stfd f25, 88(SP)
|
||||||
|
|
||||||
|
stfd f26, 96(SP)
|
||||||
|
stfd f27, 104(SP)
|
||||||
|
stfd f28, 112(SP)
|
||||||
|
stfd f29, 120(SP)
|
||||||
|
|
||||||
|
stfd f30, 128(SP)
|
||||||
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
|
||||||
|
std r31, 144(SP)
|
||||||
|
std r30, 152(SP)
|
||||||
|
std r29, 160(SP)
|
||||||
|
std r28, 168(SP)
|
||||||
|
std r27, 176(SP)
|
||||||
|
std r26, 184(SP)
|
||||||
|
std r25, 192(SP)
|
||||||
|
std r24, 200(SP)
|
||||||
|
std r23, 208(SP)
|
||||||
|
std r22, 216(SP)
|
||||||
|
std r21, 224(SP)
|
||||||
|
std r20, 232(SP)
|
||||||
|
std r19, 240(SP)
|
||||||
|
std r18, 248(SP)
|
||||||
|
std r17, 256(SP)
|
||||||
|
std r16, 264(SP)
|
||||||
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
|
stxv vs52, 288(SP)
|
||||||
|
stxv vs53, 304(SP)
|
||||||
|
stxv vs54, 320(SP)
|
||||||
|
stxv vs55, 336(SP)
|
||||||
|
stxv vs56, 352(SP)
|
||||||
|
stxv vs57, 368(SP)
|
||||||
|
stxv vs58, 384(SP)
|
||||||
|
stxv vs59, 400(SP)
|
||||||
|
stxv vs60, 416(SP)
|
||||||
|
stxv vs61, 432(SP)
|
||||||
|
stxv vs62, 448(SP)
|
||||||
|
stxv vs63, 464(SP)
|
||||||
|
std r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef TRMMKERNEL
|
||||||
|
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
#endif
|
||||||
|
slwi LDC, LDC, ZBASE_SHIFT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*alpha_r is taken from vs1 (f1) and alpha_i from vs2 (f2); convert each to single precision and splat*/
|
||||||
|
xscvdpspn alpha_r,vs1
|
||||||
|
xscvdpspn alpha_i,vs2
|
||||||
|
xxspltw alpha_r,alpha_r,0
|
||||||
|
xxspltw alpha_i,alpha_i,0
|
||||||
|
/*load reverse permute mask for big endian
|
||||||
|
uint128 = 0x0c0d0e0f08090a0b0405060700010203
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Materialise the four 64-bit .equ constants (perm_const2, perm_const1,
   save_permute_12, save_permute_11) in T2, T1, T3, T4, sixteen bits at a
   time.  Step 1: lis places the @highest 16 bits of each constant. */
lis T2, perm_const2@highest
|
||||||
|
lis T1, perm_const1@highest
|
||||||
|
lis T3, save_permute_12@highest
|
||||||
|
lis T4, save_permute_11@highest
|
||||||
|
|
||||||
|
|
||||||
|
/* Step 2: OR in the @higher 16 bits below them. */
ori T2, T2, perm_const2@higher
|
||||||
|
ori T1, T1, perm_const1@higher
|
||||||
|
ori T3, T3, save_permute_12@higher
|
||||||
|
ori T4, T4, save_permute_11@higher
|
||||||
|
|
||||||
|
|
||||||
|
/* Step 3: rotate left by 32 and keep bits 0..31 (mask end 31), moving the
   half assembled so far into the upper word and clearing the lower word. */
rldicr T2, T2, 32, 31
|
||||||
|
rldicr T1, T1, 32, 31
|
||||||
|
rldicr T3, T3, 32, 31
|
||||||
|
rldicr T4, T4, 32, 31
|
||||||
|
|
||||||
|
/* Step 4: OR the @h 16 bits into the upper half of the lower word. */
oris T2, T2, perm_const2@h
|
||||||
|
oris T1, T1, perm_const1@h
|
||||||
|
oris T3, T3, save_permute_12@h
|
||||||
|
oris T4, T4, save_permute_11@h
|
||||||
|
|
||||||
|
|
||||||
|
/* Step 5: OR in the @l (lowest) 16 bits; each register now holds its full
   64-bit constant. */
ori T2, T2, perm_const2@l
|
||||||
|
ori T1, T1, perm_const1@l
|
||||||
|
ori T3, T3, save_permute_12@l
|
||||||
|
ori T4, T4, save_permute_11@l
|
||||||
|
|
||||||
|
|
||||||
|
li r0,0
|
||||||
|
li PRE,512
|
||||||
|
|
||||||
|
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||||
|
/*negate for this case as we will use addition -1*(a+b) */
|
||||||
|
xvnegsp alpha_r,alpha_r
|
||||||
|
xvnegsp alpha_i,alpha_i
|
||||||
|
#endif
|
||||||
|
|
||||||
|
mtvsrdd permute_mask,T2,T1
|
||||||
|
mtvsrdd save_permute_1,T3,T4
|
||||||
|
|
||||||
|
/*mask is reverse permute so we have to make it inner permute */
|
||||||
|
xxpermdi permute_mask, permute_mask, permute_mask,2
|
||||||
|
|
||||||
|
#include "cgemm_logic_power10.S"
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
lfd f14, 0(SP)
|
||||||
|
lfd f15, 8(SP)
|
||||||
|
lfd f16, 16(SP)
|
||||||
|
lfd f17, 24(SP)
|
||||||
|
|
||||||
|
lfd f18, 32(SP)
|
||||||
|
lfd f19, 40(SP)
|
||||||
|
lfd f20, 48(SP)
|
||||||
|
lfd f21, 56(SP)
|
||||||
|
|
||||||
|
lfd f22, 64(SP)
|
||||||
|
lfd f23, 72(SP)
|
||||||
|
lfd f24, 80(SP)
|
||||||
|
lfd f25, 88(SP)
|
||||||
|
|
||||||
|
lfd f26, 96(SP)
|
||||||
|
lfd f27, 104(SP)
|
||||||
|
lfd f28, 112(SP)
|
||||||
|
lfd f29, 120(SP)
|
||||||
|
|
||||||
|
lfd f30, 128(SP)
|
||||||
|
lfd f31, 136(SP)
|
||||||
|
|
||||||
|
ld r31, 144(SP)
|
||||||
|
ld r30, 152(SP)
|
||||||
|
ld r29, 160(SP)
|
||||||
|
ld r28, 168(SP)
|
||||||
|
ld r27, 176(SP)
|
||||||
|
ld r26, 184(SP)
|
||||||
|
ld r25, 192(SP)
|
||||||
|
ld r24, 200(SP)
|
||||||
|
ld r23, 208(SP)
|
||||||
|
ld r22, 216(SP)
|
||||||
|
ld r21, 224(SP)
|
||||||
|
ld r20, 232(SP)
|
||||||
|
ld r19, 240(SP)
|
||||||
|
ld r18, 248(SP)
|
||||||
|
ld r17, 256(SP)
|
||||||
|
ld r16, 264(SP)
|
||||||
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
|
|
||||||
|
ld r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
lxv vs52, 288(SP)
|
||||||
|
lxv vs53, 304(SP)
|
||||||
|
lxv vs54, 320(SP)
|
||||||
|
lxv vs55, 336(SP)
|
||||||
|
lxv vs56, 352(SP)
|
||||||
|
lxv vs57, 368(SP)
|
||||||
|
lxv vs58, 384(SP)
|
||||||
|
lxv vs59, 400(SP)
|
||||||
|
mtlr r0
|
||||||
|
lxv vs60, 416(SP)
|
||||||
|
lxv vs61, 432(SP)
|
||||||
|
lxv vs62, 448(SP)
|
||||||
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
|
addi SP, SP, STACKSIZE
|
||||||
|
blr
|
||||||
|
|
||||||
|
|
||||||
|
EPILOGUE
|
||||||
|
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
|
|
||||||
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
{
|
{
|
||||||
|
|
|
@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "cswap_microk_power8.c"
|
#include "cswap_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "dasum_microk_power8.c"
|
#include "dasum_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "daxpy_microk_power8.c"
|
#include "daxpy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "dcopy_microk_power8.c"
|
#include "dcopy_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "ddot_microk_power8.c"
|
#include "ddot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,864 @@
|
||||||
|
/*********************************************************************************
|
||||||
|
Copyright (c) 2020, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
**********************************************************************************/
|
||||||
|
#include "common.h"
|
||||||
|
#include <altivec.h>
|
||||||
|
|
||||||
|
/* 16-byte vector of unsigned char elements. */
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
||||||
|
/* 16-byte vector of FLOAT elements.
   NOTE(review): despite the "4sf" in the name, the lane count is
   16 / sizeof(FLOAT) -- confirm the width of FLOAT for this kernel. */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
|
/* 8-byte vector of FLOAT elements (8 / sizeof(FLOAT) lanes; same naming
   caveat as v4sf_t). */
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||||
|
|
||||||
|
/*
 * Accumulator write-back macros.
 *
 * All three macros expect the following names at the expansion site:
 *   result - v4sf_t[4] scratch array receiving the disassembled accumulator
 *   rowC   - v4sf_t * scratch pointer
 *   CO     - FLOAT * base of the current C tile
 *   ldc    - stride between consecutive lines of C
 *   alpha  - scalar multiplier
 *
 * SAVE_ACC_LINE stores one 16-byte vector: result[IDX] * alpha goes to
 * CO[LINE * ldc + J].  With TRMMKERNEL defined the destination is
 * overwritten, otherwise the product is added to what is already there.
 * That store operator is the only difference between the two builds.
 */
#ifdef TRMMKERNEL
#define SAVE_ACC_LINE(LINE, J, IDX) \
  rowC = (v4sf_t *) &CO[LINE * ldc + J]; \
  rowC[0] = result[IDX] * alpha;
#else
#define SAVE_ACC_LINE(LINE, J, IDX) \
  rowC = (v4sf_t *) &CO[LINE * ldc + J]; \
  rowC[0] += result[IDX] * alpha;
#endif

/* Write accumulator ACC to lines 0..3 of C at offset J.  The accumulator
   slices are consumed in reverse order: result[3] goes to line 0 and
   result[0] to line 3. */
#define SAVE_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  SAVE_ACC_LINE (0, J, 3) \
  SAVE_ACC_LINE (1, J, 2) \
  SAVE_ACC_LINE (2, J, 1) \
  SAVE_ACC_LINE (3, J, 0)

/* Same as SAVE_ACC, but targets lines 4..7 of C. */
#define SAVE_ACC1(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  SAVE_ACC_LINE (4, J, 3) \
  SAVE_ACC_LINE (5, J, 2) \
  SAVE_ACC_LINE (6, J, 1) \
  SAVE_ACC_LINE (7, J, 0)

/* Two-line variant: only result[3] and result[2] are written (lines 0 and 1);
   the other two accumulator slices are ignored. */
#define SAVE2x4_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  SAVE_ACC_LINE (0, J, 3) \
  SAVE_ACC_LINE (1, J, 2)
|
||||||
|
|
||||||
|
/* Clear the MMA accumulators acc0..acc3; the variables must be declared at
   the expansion site. */
#define SET_ACC_ZERO4() \
  __builtin_mma_xxsetaccz (&acc0); \
  __builtin_mma_xxsetaccz (&acc1); \
  __builtin_mma_xxsetaccz (&acc2); \
  __builtin_mma_xxsetaccz (&acc3);

/* Clear acc0..acc7: the first four through SET_ACC_ZERO4, then the upper
   four, in the same acc0 -> acc7 order as before. */
#define SET_ACC_ZERO8() \
  SET_ACC_ZERO4 () \
  __builtin_mma_xxsetaccz (&acc4); \
  __builtin_mma_xxsetaccz (&acc5); \
  __builtin_mma_xxsetaccz (&acc6); \
  __builtin_mma_xxsetaccz (&acc7);
|
||||||
|
|
||||||
|
/*
 * PREFETCH1(x, y): issue a dcbt cache-touch hint for the address x + y.
 *
 * dcbt forms its effective address as (RA|0) + (RB): when the first register
 * operand is r0, the value 0 is used instead of the register contents.  The
 * pointer x therefore needs the "b" (base register, never r0) constraint,
 * while the displacement y may live in any general register.  The previous
 * constraints were the other way round, which allowed the compiler to place
 * x in r0 and so prefetch address y instead of x + y.
 *
 * The trailing semicolon is kept so existing uses expand exactly as before.
 */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
|
||||||
|
|
||||||
|
/*
 * TRMM bookkeeping macros.
 *
 * They operate on names that must exist at the expansion site: off, temp, k,
 * AO, BO and B.  In every macro x is the first and y the second argument
 * given at the call site (the kernel passes the tile sizes, e.g. (16, 4)).
 *
 * Macro parameters are parenthesized wherever they take part in arithmetic
 * so that an expression argument cannot change the meaning of the expansion.
 * The trailing semicolons are intentional: several call sites invoke these
 * macros without one.
 */

/* REFRESH_TEMP_BK: set temp, the inner-loop trip count for the next tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
#define REFRESH_TEMP_BK(x, y) \
  temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) \
  temp = off + (x);
#else
#define REFRESH_TEMP_BK(x, y) \
  temp = off + (y);
#endif

/* REFRESH_POINTERS: position BO (and, in the second variant, AO) before a
   tile is computed, then refresh temp.  The first variant starts at the
   beginning of B; the second skips the first off steps of both panels. */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_POINTERS(x, y) \
  BO = B; \
  REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y) \
  AO += off * (x); \
  BO = B + off * (y); \
  REFRESH_TEMP_BK(x, y)
#endif

/* REFRESH_OFF: with LEFT defined, advance off by x; otherwise expands to
   nothing. */
#ifdef LEFT
#define REFRESH_OFF(x) \
  off += (x);
#else
#define REFRESH_OFF(x)
#endif

/* UPDATE_TEMP: subtract x (LEFT) or y (otherwise) from temp. */
#ifdef LEFT
#define UPDATE_TEMP(x, y) \
  temp -= (x);
#else
#define UPDATE_TEMP(x, y) \
  temp -= (y);
#endif

/* REFRESH_TMP_AFTER_SAVE: in the first configuration, recompute temp as
   k - off reduced by UPDATE_TEMP and advance AO and BO by that many steps;
   in the other configuration it expands to nothing. */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_TMP_AFTER_SAVE(x, y) \
  temp = k - off; \
  UPDATE_TEMP(x, y) \
  AO += temp * (x); \
  BO += temp * (y);
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif

/* REFRESH_AFTER_SAVE: bookkeeping run after a tile has been written back. */
#define REFRESH_AFTER_SAVE(x,y) \
  REFRESH_TMP_AFTER_SAVE(x, y) \
  REFRESH_OFF(x)
|
||||||
|
/*************************************************************************************
|
||||||
|
* GEMM Kernel
|
||||||
|
*************************************************************************************/
|
||||||
|
/*
 * Kernel entry point (the symbol name is supplied through the CNAME macro).
 *
 * Computes alpha * A * B and either adds it to C or, when TRMMKERNEL is
 * defined, overwrites C with it (see the SAVE_* macros and the scalar
 * store paths below).
 *
 *   m      - extent of C along its contiguous direction; consumed in tiles
 *            of 16, 8, 4, 2 and 1
 *   n      - number of ldc-strided lines of C; consumed in panels of 4, 2, 1
 *   k      - inner length (trip count of the innermost loops in the
 *            non-TRMM build)
 *   alpha  - scalar applied to every product before it is stored
 *   A      - read as consecutive groups of <tile size> values per inner step
 *   B      - read as consecutive groups of <panel width> values per inner
 *            step; advanced by k * <panel width> after each panel
 *   C, ldc - output and the stride between its lines
 *   offset - (TRMMKERNEL only) initial value for the off counter used by the
 *            REFRESH_* macros
 *
 * Always returns 0.
 */
int
|
||||||
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||||
|
FLOAT * C, BLASLONG ldc
|
||||||
|
#ifdef TRMMKERNEL
|
||||||
|
, BLASLONG offset
|
||||||
|
#endif
|
||||||
|
)
|
||||||
|
{
|
||||||
|
/* N is overwritten before its first use; the initial value is unused. */
BLASLONG N = n;
|
||||||
|
BLASLONG i1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
BLASLONG off;
|
||||||
|
#endif
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off = -offset;
|
||||||
|
#endif
|
||||||
|
/* alpha replicated into a vector for the scalar (non-MMA) tail paths. */
v4sf_t valpha = { alpha, alpha };
|
||||||
|
/* ---- Panels of 4 lines of C (B is read 4 values per inner step). ---- */
N = n >> 2;
|
||||||
|
for (i1 = 0; i1 < N; i1++)
|
||||||
|
{
|
||||||
|
BLASLONG i, j, temp;
|
||||||
|
FLOAT *CO;
|
||||||
|
FLOAT *AO;
|
||||||
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
|
off = offset;
|
||||||
|
#endif
|
||||||
|
CO = C;
|
||||||
|
C += ldc << 2;
|
||||||
|
AO = A;
|
||||||
|
PREFETCH1 (A, 128);
|
||||||
|
PREFETCH1 (A, 256);
|
||||||
|
/* 16 x 4 tiles: eight accumulators, one per 16-byte slice of the A group. */
i = m >> 4;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (16, 4);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
BLASLONG l = 0;
|
||||||
|
/* Touch the four C lines of this tile at byte offsets 0 and 128. */
PREFETCH1 (CO, 0);
|
||||||
|
PREFETCH1 (CO + ldc, 0);
|
||||||
|
PREFETCH1 (CO + ldc + ldc, 0);
|
||||||
|
PREFETCH1 (CO + ldc + ldc + ldc, 0);
|
||||||
|
PREFETCH1 (CO, 128);
|
||||||
|
PREFETCH1 (CO + ldc, 128);
|
||||||
|
PREFETCH1 (CO + ldc + ldc, 128);
|
||||||
|
PREFETCH1 (CO + ldc + ldc + ldc, 128);
|
||||||
|
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||||
|
SET_ACC_ZERO8 ();
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 4];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & BO[l << 2];
|
||||||
|
/* The two 16-byte halves of the B group are passed in swapped order. */
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
|
||||||
|
}
|
||||||
|
/* accN was fed rowA[N], so it is stored at offset 2 * N within the tile;
   the stores are simply issued in a different order. */
SAVE_ACC (&acc0, 0);
|
||||||
|
SAVE_ACC (&acc2, 4);
|
||||||
|
SAVE_ACC (&acc1, 2);
|
||||||
|
SAVE_ACC (&acc3, 6);
|
||||||
|
SAVE_ACC (&acc4, 8);
|
||||||
|
SAVE_ACC (&acc6, 12);
|
||||||
|
SAVE_ACC (&acc5, 10);
|
||||||
|
SAVE_ACC (&acc7, 14);
|
||||||
|
AO += temp << 4;
|
||||||
|
BO += temp << 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (16, 4)
|
||||||
|
#endif
|
||||||
|
CO += 16;
|
||||||
|
}
|
||||||
|
/* 8 x 4 tile (at most one): four accumulators. */
i = (m & 15) >> 3;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (8, 4);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0, acc1, acc2, acc3;
|
||||||
|
SET_ACC_ZERO4 ();
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 3];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & BO[l << 2];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||||
|
}
|
||||||
|
SAVE_ACC (&acc0, 0);
|
||||||
|
SAVE_ACC (&acc2, 4);
|
||||||
|
SAVE_ACC (&acc1, 2);
|
||||||
|
SAVE_ACC (&acc3, 6);
|
||||||
|
CO += 8;
|
||||||
|
AO += temp << 3;
|
||||||
|
BO += temp << 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (8, 4)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 4 x 4 tile (at most one): two accumulators. */
i = (m & 7) >> 2;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (4, 4);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0, acc1;
|
||||||
|
__builtin_mma_xxsetaccz (&acc0);
|
||||||
|
__builtin_mma_xxsetaccz (&acc1);
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 2];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & BO[l << 2];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
|
}
|
||||||
|
SAVE_ACC (&acc0, 0);
|
||||||
|
SAVE_ACC (&acc1, 2);
|
||||||
|
CO += 4;
|
||||||
|
AO += temp << 2;
|
||||||
|
BO += temp << 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (4, 4)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 2 x 4 tile (at most one): a single accumulator. */
i = (m & 3) >> 1;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (2, 4);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0;
|
||||||
|
__builtin_mma_xxsetaccz (&acc0);
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 1];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & BO[l << 2];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
}
|
||||||
|
SAVE_ACC (&acc0, 0);
|
||||||
|
CO += 2;
|
||||||
|
AO += temp << 1;
|
||||||
|
BO += temp << 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (2, 4)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 1 x 4 tail (at most one): plain vector arithmetic, no MMA.  t holds the
   sums for lines 0-1 of C, t1 those for lines 2-3. */
i = (m & 1) >> 0;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (1, 4);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
v4sf_t t = { 0, 0 };
|
||||||
|
v4sf_t t1 = { 0, 0 };
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
v4sf_t rowA = { AO[l], AO[l] };
|
||||||
|
v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
|
||||||
|
v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
|
||||||
|
t += rowA * rowB;
|
||||||
|
t1 += rowA * rowB1;
|
||||||
|
}
|
||||||
|
t = t * valpha;
|
||||||
|
t1 = t1 * valpha;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0 * ldc] = t[0];
|
||||||
|
CO[1 * ldc] = t[1];
|
||||||
|
CO[2 * ldc] = t1[0];
|
||||||
|
CO[3 * ldc] = t1[1];
|
||||||
|
#else
|
||||||
|
CO[0 * ldc] += t[0];
|
||||||
|
CO[1 * ldc] += t[1];
|
||||||
|
CO[2 * ldc] += t1[0];
|
||||||
|
CO[3 * ldc] += t1[1];
|
||||||
|
#endif
|
||||||
|
CO += 1;
|
||||||
|
AO += temp;
|
||||||
|
BO += temp << 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (1, 4)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off += 4; // advance off by the width of the panel just finished (4)
|
||||||
|
#endif
|
||||||
|
B += k << 2;
|
||||||
|
}
|
||||||
|
/* ---- Panel of 2 lines of C (at most one; B is read 2 values per inner
   step).  The MMA tiles copy the two B values into a zero-padded 4-element
   buffer so the same pair-based instruction can be used; only the two
   accumulator slices written by SAVE2x4_ACC are kept. ---- */
N = (n & 3) >> 1;
|
||||||
|
for (i1 = 0; i1 < N; i1++)
|
||||||
|
{
|
||||||
|
BLASLONG i, j, temp;
|
||||||
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
|
off = offset;
|
||||||
|
#endif
|
||||||
|
FLOAT *CO;
|
||||||
|
FLOAT *AO;
|
||||||
|
CO = C;
|
||||||
|
C += ldc << 1;
|
||||||
|
AO = A;
|
||||||
|
/* 16 x 2 tiles. */
i = m >> 4;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (16, 2);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||||
|
SET_ACC_ZERO8 ();
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||||
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & t[0];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 4];
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
|
||||||
|
}
|
||||||
|
SAVE2x4_ACC (&acc0, 0);
|
||||||
|
SAVE2x4_ACC (&acc1, 2);
|
||||||
|
SAVE2x4_ACC (&acc2, 4);
|
||||||
|
SAVE2x4_ACC (&acc3, 6);
|
||||||
|
SAVE2x4_ACC (&acc4, 8);
|
||||||
|
SAVE2x4_ACC (&acc5, 10);
|
||||||
|
SAVE2x4_ACC (&acc6, 12);
|
||||||
|
SAVE2x4_ACC (&acc7, 14);
|
||||||
|
CO += 16;
|
||||||
|
AO += temp << 4;
|
||||||
|
BO += temp << 1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (16, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 8 x 2 tile (at most one). */
i = (m & 15) >> 3;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (8, 2);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0, acc1, acc2, acc3;
|
||||||
|
SET_ACC_ZERO4 ();
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||||
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & t[0];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 3];
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||||
|
}
|
||||||
|
SAVE2x4_ACC (&acc0, 0);
|
||||||
|
SAVE2x4_ACC (&acc1, 2);
|
||||||
|
SAVE2x4_ACC (&acc2, 4);
|
||||||
|
SAVE2x4_ACC (&acc3, 6);
|
||||||
|
CO += 8;
|
||||||
|
AO += temp << 3;
|
||||||
|
BO += temp << 1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (8, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 4 x 2 tile (at most one). */
i = (m & 7) >> 2;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (4, 2);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0, acc1;
|
||||||
|
__builtin_mma_xxsetaccz (&acc0);
|
||||||
|
__builtin_mma_xxsetaccz (&acc1);
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||||
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & t[0];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 2];
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||||
|
}
|
||||||
|
SAVE2x4_ACC (&acc0, 0);
|
||||||
|
SAVE2x4_ACC (&acc1, 2);
|
||||||
|
CO += 4;
|
||||||
|
AO += temp << 2;
|
||||||
|
BO += temp << 1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (4, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 2 x 2 tile (at most one). */
i = (m & 3) >> 1;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (2, 2);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
v4sf_t *rowC;
|
||||||
|
v4sf_t result[4];
|
||||||
|
__vector_quad acc0;
|
||||||
|
__builtin_mma_xxsetaccz (&acc0);
|
||||||
|
BLASLONG l = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||||
|
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||||
|
__vector_pair rowB;
|
||||||
|
vec_t *rb = (vec_t *) & t[0];
|
||||||
|
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||||
|
vec_t *rowA = (vec_t *) & AO[l << 1];
|
||||||
|
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||||
|
}
|
||||||
|
SAVE2x4_ACC (&acc0, 0);
|
||||||
|
CO += 2;
|
||||||
|
AO += temp << 1;
|
||||||
|
BO += temp << 1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (2, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 1 x 2 tail (at most one): plain vector arithmetic, no MMA. */
i = (m & 1) >> 0;
|
||||||
|
for (j = 0; j < i; j++)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (1, 2);
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
v4sf_t t = { 0, 0 };
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
v4sf_t rowA = { AO[l], AO[l] };
|
||||||
|
v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
|
||||||
|
t += rowA * rowB;
|
||||||
|
}
|
||||||
|
t = t * valpha;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0 * ldc] = t[0];
|
||||||
|
CO[1 * ldc] = t[1];
|
||||||
|
#else
|
||||||
|
CO[0 * ldc] += t[0];
|
||||||
|
CO[1 * ldc] += t[1];
|
||||||
|
#endif
|
||||||
|
CO += 1;
|
||||||
|
AO += temp;
|
||||||
|
BO += temp << 1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (1, 2)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off += 2; // advance off by the width of the panel just finished (2)
|
||||||
|
#endif
|
||||||
|
B += k << 1;
|
||||||
|
}
|
||||||
|
/* ---- Final single line of C (at most one; B is read 1 value per inner
   step).  No MMA here: every tile size uses plain vector or scalar
   arithmetic, and i counts the elements of the line still to be done. ---- */
N = (n & 1) >> 0;
|
||||||
|
for (i1 = 0; i1 < N; i1++)
|
||||||
|
{
|
||||||
|
BLASLONG i, temp;
|
||||||
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
|
off = offset;
|
||||||
|
#endif
|
||||||
|
FLOAT *CO;
|
||||||
|
FLOAT *AO;
|
||||||
|
CO = C;
|
||||||
|
C += ldc;
|
||||||
|
AO = A;
|
||||||
|
i = m;
|
||||||
|
/* 16 x 1 tiles: eight 2-element partial sums t..t7. */
while (i >= 16)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (16, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
v4sf_t t = { 0, 0 };
|
||||||
|
v4sf_t t1 = { 0, 0 };
|
||||||
|
v4sf_t t2 = { 0, 0 };
|
||||||
|
v4sf_t t3 = { 0, 0 };
|
||||||
|
v4sf_t t4 = { 0, 0 };
|
||||||
|
v4sf_t t5 = { 0, 0 };
|
||||||
|
v4sf_t t6 = { 0, 0 };
|
||||||
|
v4sf_t t7 = { 0, 0 };
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
v4sf_t rowB = { BO[l], BO[l] };
|
||||||
|
v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
|
||||||
|
v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
|
||||||
|
v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
|
||||||
|
v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
|
||||||
|
v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
|
||||||
|
v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
|
||||||
|
v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
|
||||||
|
v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
|
||||||
|
t += rowA * rowB;
|
||||||
|
t1 += rowA1 * rowB;
|
||||||
|
t2 += rowA2 * rowB;
|
||||||
|
t3 += rowA3 * rowB;
|
||||||
|
t4 += rowA4 * rowB;
|
||||||
|
t5 += rowA5 * rowB;
|
||||||
|
t6 += rowA6 * rowB;
|
||||||
|
t7 += rowA7 * rowB;
|
||||||
|
}
|
||||||
|
t = t * valpha;
|
||||||
|
t1 = t1 * valpha;
|
||||||
|
t2 = t2 * valpha;
|
||||||
|
t3 = t3 * valpha;
|
||||||
|
t4 = t4 * valpha;
|
||||||
|
t5 = t5 * valpha;
|
||||||
|
t6 = t6 * valpha;
|
||||||
|
t7 = t7 * valpha;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0] = t[0];
|
||||||
|
CO[1] = t[1];
|
||||||
|
CO[2] = t1[0];
|
||||||
|
CO[3] = t1[1];
|
||||||
|
CO[4] = t2[0];
|
||||||
|
CO[5] = t2[1];
|
||||||
|
CO[6] = t3[0];
|
||||||
|
CO[7] = t3[1];
|
||||||
|
CO[8] = t4[0];
|
||||||
|
CO[9] = t4[1];
|
||||||
|
CO[10] = t5[0];
|
||||||
|
CO[11] = t5[1];
|
||||||
|
CO[12] = t6[0];
|
||||||
|
CO[13] = t6[1];
|
||||||
|
CO[14] = t7[0];
|
||||||
|
CO[15] = t7[1];
|
||||||
|
#else
|
||||||
|
CO[0] += t[0];
|
||||||
|
CO[1] += t[1];
|
||||||
|
CO[2] += t1[0];
|
||||||
|
CO[3] += t1[1];
|
||||||
|
CO[4] += t2[0];
|
||||||
|
CO[5] += t2[1];
|
||||||
|
CO[6] += t3[0];
|
||||||
|
CO[7] += t3[1];
|
||||||
|
CO[8] += t4[0];
|
||||||
|
CO[9] += t4[1];
|
||||||
|
CO[10] += t5[0];
|
||||||
|
CO[11] += t5[1];
|
||||||
|
CO[12] += t6[0];
|
||||||
|
CO[13] += t6[1];
|
||||||
|
CO[14] += t7[0];
|
||||||
|
CO[15] += t7[1];
|
||||||
|
#endif
|
||||||
|
AO += temp << 4;
|
||||||
|
BO += temp;
|
||||||
|
CO += 16;
|
||||||
|
i -= 16;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (16, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 8 x 1 tile. */
while (i >= 8)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (8, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
v4sf_t t = { 0, 0 };
|
||||||
|
v4sf_t t1 = { 0, 0 };
|
||||||
|
v4sf_t t2 = { 0, 0 };
|
||||||
|
v4sf_t t3 = { 0, 0 };
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
v4sf_t rowB = { BO[l], BO[l] };
|
||||||
|
v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
|
||||||
|
v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
|
||||||
|
v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
|
||||||
|
v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
|
||||||
|
t += rowA * rowB;
|
||||||
|
t1 += rowA1 * rowB;
|
||||||
|
t2 += rowA2 * rowB;
|
||||||
|
t3 += rowA3 * rowB;
|
||||||
|
}
|
||||||
|
t = t * valpha;
|
||||||
|
t1 = t1 * valpha;
|
||||||
|
t2 = t2 * valpha;
|
||||||
|
t3 = t3 * valpha;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0] = t[0];
|
||||||
|
CO[1] = t[1];
|
||||||
|
CO[2] = t1[0];
|
||||||
|
CO[3] = t1[1];
|
||||||
|
CO[4] = t2[0];
|
||||||
|
CO[5] = t2[1];
|
||||||
|
CO[6] = t3[0];
|
||||||
|
CO[7] = t3[1];
|
||||||
|
#else
|
||||||
|
CO[0] += t[0];
|
||||||
|
CO[1] += t[1];
|
||||||
|
CO[2] += t1[0];
|
||||||
|
CO[3] += t1[1];
|
||||||
|
CO[4] += t2[0];
|
||||||
|
CO[5] += t2[1];
|
||||||
|
CO[6] += t3[0];
|
||||||
|
CO[7] += t3[1];
|
||||||
|
#endif
|
||||||
|
AO += temp << 3;
|
||||||
|
BO += temp;
|
||||||
|
CO += 8;
|
||||||
|
i -= 8;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (8, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 4 x 1 tile. */
while (i >= 4)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (4, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
v4sf_t t = { 0, 0 };
|
||||||
|
v4sf_t t1 = { 0, 0 };
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
v4sf_t rowB = { BO[l], BO[l] };
|
||||||
|
v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
|
||||||
|
v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
|
||||||
|
t += rowA * rowB;
|
||||||
|
t1 += rowA1 * rowB;
|
||||||
|
}
|
||||||
|
t = t * valpha;
|
||||||
|
t1 = t1 * valpha;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0] = t[0];
|
||||||
|
CO[1] = t[1];
|
||||||
|
CO[2] = t1[0];
|
||||||
|
CO[3] = t1[1];
|
||||||
|
#else
|
||||||
|
CO[0] += t[0];
|
||||||
|
CO[1] += t[1];
|
||||||
|
CO[2] += t1[0];
|
||||||
|
CO[3] += t1[1];
|
||||||
|
#endif
|
||||||
|
AO += temp << 2;
|
||||||
|
BO += temp;
|
||||||
|
CO += 4;
|
||||||
|
i -= 4;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (4, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 2 x 1 tile. */
while (i >= 2)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (2, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
v4sf_t t = { 0, 0 };
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
v4sf_t rowB = { BO[l], BO[l] };
|
||||||
|
v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
|
||||||
|
t += rowA * rowB;
|
||||||
|
}
|
||||||
|
t = t * valpha;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0] = t[0];
|
||||||
|
CO[1] = t[1];
|
||||||
|
#else
|
||||||
|
CO[0] += t[0];
|
||||||
|
CO[1] += t[1];
|
||||||
|
#endif
|
||||||
|
AO += temp << 1;
|
||||||
|
BO += temp;
|
||||||
|
CO += 2;
|
||||||
|
i -= 2;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (2, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
/* 1 x 1 tail: scalar dot product, scaled by the scalar alpha. */
while (i >= 1)
|
||||||
|
{
|
||||||
|
FLOAT *BO;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_POINTERS (1, 1)
|
||||||
|
#else
|
||||||
|
BO = B;
|
||||||
|
temp = k;
|
||||||
|
#endif
|
||||||
|
BLASLONG l = 0;
|
||||||
|
FLOAT t = 0;
|
||||||
|
for (l = 0; l < temp; l++)
|
||||||
|
{
|
||||||
|
t += AO[l] * BO[l];
|
||||||
|
}
|
||||||
|
AO += temp;
|
||||||
|
BO += temp;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
CO[0] = t * alpha;
|
||||||
|
#else
|
||||||
|
CO[0] += t * alpha;
|
||||||
|
#endif
|
||||||
|
CO += 1;
|
||||||
|
i -= 1;
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
REFRESH_AFTER_SAVE (1, 1)
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||||
|
off += 1; // advance off by the width of the panel just finished (1)
|
||||||
|
#endif
|
||||||
|
B += k;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "dgemv_n_microk_power8.c"
|
#include "dgemv_n_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#pragma GCC optimize "O1"
|
#pragma GCC optimize "O1"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "drot_microk_power8.c"
|
#include "drot_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#include "dscal_microk_power8.c"
|
#include "dscal_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue