diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml new file mode 100644 index 000000000..b6a4090bd --- /dev/null +++ b/.github/workflows/dynamic_arch.yml @@ -0,0 +1,81 @@ +name: continuous build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + build: [cmake, make] + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Compilation cache + uses: actions/cache@v2 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + key: ${{ runner.os }}-ccache-${{ github.sha }} + # Restore any ccache cache entry, if none for + # ${{ runner.os }}-ccache-${{ github.sha }} exists + restore-keys: | + ${{ runner.os }}-ccache + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install coreutils cmake ccache + else + echo "$RUNNER_OS not supported" + exit 1 + fi + ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB + + - name: Build + if: matrix.build == 'make' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 + + - name: CMake build + if: matrix.build == 'cmake' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + mkdir build + cd build + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. 
+ make -j$(nproc) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index f55e73d23..8d7cfea2d 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -21,6 +21,7 @@ jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest env: + DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer HOMEBREW_DEVELOPER: "ON" HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" HOMEBREW_NO_ANALYTICS: "ON" diff --git a/.gitignore b/.gitignore index 6803a919e..bca79f043 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ test/SBLAT2.SUMM test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM +test/SHBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -79,6 +80,7 @@ test/dblat3 test/sblat1 test/sblat2 test/sblat3 +test/test_shgemm test/zblat1 test/zblat2 test/zblat3 diff --git a/.travis.yml b/.travis.yml index c875572b2..101147353 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,6 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - - set -e - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE @@ -108,7 +107,6 @@ matrix: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' before_script: *common-before script: - - set -e # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" @@ -151,7 +149,6 @@ matrix: before_script: - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" script: - - set -e - mkdir build - CONFIG=Release - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG diff --git a/CMakeLists.txt b/CMakeLists.txt index 951271717..bb5322a1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,8 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) + set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -23,6 +24,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. 
R or numpy/scipy to a single core" ON)
 else()
@@ -86,9 +88,13 @@ if (NOT NO_LAPACK)
   list(APPEND SUBDIRS lapack)
 endif ()
+if (NOT DEFINED BUILD_HALF)
+ set (BUILD_HALF false)
+endif ()
 # set which float types we want to build for
 if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
 # if none are defined, build for all
+# set(BUILD_HALF true)
 set(BUILD_SINGLE true)
 set(BUILD_DOUBLE true)
 set(BUILD_COMPLEX true)
@@ -120,6 +126,11 @@ if (BUILD_COMPLEX16)
   list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
 endif ()
+if (BUILD_HALF)
+ message(STATUS "Building Half Precision")
+ list(APPEND FLOAT_TYPES "HALF") # defines nothing
+endif ()
+
 if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
   message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.")
 endif ()
@@ -234,7 +245,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
   if (NOT MSVC)
     target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
   else()
-    target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
   endif()
 endif()
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index df497c1d2..aba39e56f 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -180,3 +180,13 @@ In chronological order:
   * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
   * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
   * [2020-01-07] optimize AVX2 SGEMM and STRMM
+
+* Rajalakshmi Srinivasaraghavan
+  * [2020-04-15] Half-precision GEMM for bfloat16
+
+* Marius Hillenbrand
+  * [2020-05-12] Revise dynamic architecture detection for IBM z
+  * [2020-05-12] Add new sgemm and strmm kernel for IBM z14
+
+* Danfeng Zhang
+  * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
\ No newline at end of file
diff --git a/Changelog.txt b/Changelog.txt
index 5f924629b..cbf0b50f5 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,77 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.10
+ 14-Jun-2020
+
+common:
+ * Improved thread locking behaviour in blas_server and parallel getrf
+ * Imported bugfix 394 from LAPACK (spurious reference to "XERBL"
+   due to overlong lines)
+ * Imported bugfix 403 from LAPACK (compile option "recursive" required
+   for correctness with Intel and PGI)
+ * Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB)
+ * Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP)
+ * Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that
+   could lead to crashes at large matrix sizes
+ * Restored internal soname in dynamic libraries on FreeBSD and DragonFly
+ * Added API (openblas_setaffinity) to set the thread affinity on Linux
+ * Added initial infrastructure for half-precision floating point
+   (bfloat16) support with a generic implementation of SHGEMM
+ * Added CMAKE build system support for building the cblas_Xgemm3m
+   functions
+ * Fixed CMAKE support for building in a path with embedded spaces
+ * Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC
+ * Fixed GCC version detection in the Makefiles
+ * Allowed overriding the names of AR, AS and LD in Makefile builds
+
+POWER:
+ * Fixed big-endian POWER8 ELFv2 builds on FreeBSD
+ * Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9
+ * Fixed CMAKE build support for POWER9
+ * Fixed a potential race condition in the thread buffer allocation
+ * Worked around LAPACK test failures on PPC G4
+
+MIPS:
+ * Fixed a potential race condition in the thread buffer allocation
+ * Added support for MIPS 24K/24KE family based on P5600 kernels
+
+MIPS64:
+ * Fixed a potential race condition in the thread buffer allocation
+ * Added TARGET=GENERIC
+
+ARMV7:
+ * Fixed a race condition in the thread buffer allocation
+
+ARMV8:
+ * Fixed a race condition in the thread buffer allocation
+ * Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA
+ * Improved performance of the ThunderX2 DAXPY kernel
+ * Added an optimized SGEMM kernel for Cortex A53
+ * Fixed Makefile support for INTERFACE64 (8-byte integer)
+
+x86_64:
+ * Fixed a syntax error in the CMAKE setup for SkylakeX
+ * Improved performance of STRSM on Haswell, SkylakeX and Ryzen
+ * Improved SGEMM performance for workloads with ldc a
+   multiple of 1024
+ * Improved DGEMM performance on SkylakeX
+ * Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH
+   builds created on SkylakeX
+ * Removed data alignment requirement in the SSE2 copy kernels
+   that could cause spurious crashes
+ * Added a workaround for an optimizer bug in AppleClang 11.0.3
+ * Fixed LAPACK test failures due to wrong options for Intel Fortran
+ * Fixed compilation and LAPACK test results with recent Flang
+   and AMD AOCC
+ * Fixed DYNAMIC_ARCH builds with CMAKE on OS X
+ * Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max,
+   cblas_?sum, cblas_?gemm3m in the shared library on OS X
+ * Fixed reporting of CPU name in DYNAMIC_ARCH builds (would sometimes
+   show the name of an older generation chip supported by the same kernels)
+
+IBM Z:
+ * Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14
+
 ====================================================================
 Version 0.3.9
 1-Mar-2020
diff --git a/Makefile b/Makefile
index 18320e6a3..e113026dd 100644
--- a/Makefile
+++ b/Makefile
@@ -264,6 +264,7 @@ lapack_prebuild :
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
diff --git a/Makefile.power b/Makefile.power
index 24d8aa8a7..5c431860f 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -9,6 +9,16 @@ else
 USE_OPENMP = 1
 endif
 
+ifeq ($(CORE), POWER10)
+ifeq ($(USE_OPENMP), 1)
+COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
+FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
+else
+COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math
+endif
+endif
+
 ifeq ($(CORE), POWER9)
 ifeq ($(USE_OPENMP), 1)
 COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
diff --git a/Makefile.prebuild b/Makefile.prebuild
index b00f13368..48fb5e991 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -17,7 +17,11 @@ ifdef CPUIDEMU
 EXFLAGS = -DCPUIDEMU -DVENDOR=99
 endif
 
-ifeq ($(TARGET), 1004K)
+ifeq ($(TARGET), MIPS24K)
+TARGET_FLAGS = -mips32r2
+endif
+
+ifeq ($(TARGET), MIPS1004K)
 TARGET_FLAGS = -mips32r2
 endif
diff
--git a/Makefile.rule b/Makefile.rule index 724a60ec4..2c12177ee 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.9.dev +VERSION = 0.3.10.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -273,6 +273,9 @@ COMMON_PROF = -pg # # CPP_THREAD_SAFETY_TEST = 1 + +# If you want to enable the experimental BFLOAT16 support +# BUILD_HALF = 1 # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index 2998c0e6a..2a0694d01 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc) +override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 else ifeq ($(ARCH), aarch64) @@ -261,10 +263,10 @@ endif ARFLAGS = CPP = $(COMPILER) -E -AR = $(CROSS_SUFFIX)ar -AS = $(CROSS_SUFFIX)as -LD = $(CROSS_SUFFIX)ld -RANLIB = $(CROSS_SUFFIX)ranlib +AR ?= $(CROSS_SUFFIX)ar +AS ?= $(CROSS_SUFFIX)as +LD ?= $(CROSS_SUFFIX)ld +RANLIB ?= $(CROSS_SUFFIX)ranlib NM = $(CROSS_SUFFIX)nm DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy @@ -277,6 +279,17 @@ NO_LAPACK = 1 override FEXTRALIB = endif +ifeq ($(C_COMPILER), GCC) +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +endif + # # OS dependent settings # @@ -323,13 +336,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -DMS_ABI endif -ifeq ($(C_COMPILER), GCC) #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Major version > 4 # It is compatible with MSVC ABI. 
@@ -343,7 +350,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) CCOMMON_OPT += -DMS_ABI endif endif -endif # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 @@ -563,8 +569,34 @@ DYNAMIC_CORE += EMAG8180 endif ifeq ($(ARCH), zarch) -DYNAMIC_CORE = Z13 +DYNAMIC_CORE = ZARCH_GENERIC + +# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(ZARCH_SUPPORT_Z13), 1) +DYNAMIC_CORE += Z13 +else +$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +endif + +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 +else +$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) +endif endif ifeq ($(ARCH), power) @@ -572,14 +604,20 @@ DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 +DYNAMIC_CORE += POWER10 endif ifeq ($(C_COMPILER), GCC) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +ifeq ($(GCCVERSIONGTEQ11), 1) +DYNAMIC_CORE += POWER10 +else +$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) +endif endif endif @@ -690,7 +728,12 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif -ifeq ($(CORE), 1004K) +ifeq ($(CORE), MIPS24K) +CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) +FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) +endif + +ifeq ($(CORE), MIPS1004K) CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) endif @@ -755,6 +798,15 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG +FCOMMON_OPT += -Mrecursive -Kieee +ifeq ($(OSNAME), Linux) +ifeq ($(ARCH), x86_64) +FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +ifeq ($(FLANG_VENDOR),AOCC) +FCOMMON_OPT += -fno-unroll-loops +endif +endif +endif ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -850,7 +902,7 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif -FCOMMON_OPT += -recursive +FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif @@ -1119,6 +1171,10 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif +ifeq ($(BUILD_HALF), 1) +CCOMMON_OPT += -DBUILD_HALF +endif + CCOMMON_OPT += -DVERSION=\"$(VERSION)\" ifndef SYMBOLPREFIX @@ -1145,6 +1201,7 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) @@ -1237,7 +1294,6 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) - override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = @@ -1344,6 +1400,7 @@ export ARCH export CORE export LIBCORE export __BYTE_ORDER__ 
+export ELF_VERSION export PGCPATH export CONFIG export CC @@ -1389,7 +1446,10 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export BUILD_HALF +export SHGEMM_UNROLL_M +export SHGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 2adede1a5..39902982b 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -1,3 +1,4 @@ +SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) @@ -9,8 +10,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -22,6 +23,7 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif +$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX @@ -29,6 +31,7 @@ $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX +$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) diff --git a/Makefile.zarch b/Makefile.zarch index 47ea1eb71..be1e34f6d 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector endif ifeq ($(CORE), Z14) -CCOMMON_OPT += -march=z14 -mzvector +CCOMMON_OPT += -march=z14 -mzvector -O3 FCOMMON_OPT += -march=z14 -mzvector endif diff --git a/README.md b/README.md index 61393bd8f..6dc3c7b42 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. - **AMD ZEN**: Uses Haswell codes with some optimizations. +#### MIPS32 + +- **MIPS 1004K**: uses P5600 codes +- **MIPS 24K**: uses P5600 codes + #### MIPS64 - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. 
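For context, the two new MIPS32 targets documented in the README hunk above are selected the same way as any other entry in TargetList.txt. A minimal cross-build sketch, assuming a generic mips-linux-gnu toolchain (the toolchain triple and the NOFORTRAN=1 shortcut are illustrative assumptions, not part of this change):

    make TARGET=MIPS1004K CC=mips-linux-gnu-gcc HOSTCC=gcc NOFORTRAN=1
    make TARGET=MIPS24K   CC=mips-linux-gnu-gcc HOSTCC=gcc NOFORTRAN=1

Both targets reuse the P5600 kernels and pick up the -mips32r2 flags added in Makefile.prebuild and Makefile.system above.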
diff --git a/TargetList.txt b/TargetList.txt index f4a40ed02..4e54e3077 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -49,6 +49,7 @@ POWER6 POWER7 POWER8 POWER9 +POWER10 PPCG4 PPC970 PPC970MP @@ -58,7 +59,8 @@ CELL 3.MIPS CPU: P5600 -1004K +MIPS1004K +MIPS24K 4.MIPS64 CPU: SICORTEX diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b4c85367..639cb3558 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -49,3 +49,23 @@ jobs: # we need a privileged docker run for sde process attachment docker run --privileged intel_sde displayName: 'Run AVX512 SkylakeX docker build / test' + +- job: Windows_cl + pool: + vmImage: 'windows-latest' + steps: + - task: CMake@1 + inputs: + workingDirectory: 'build' # Optional + cmakeArgs: '-G "Visual Studio 16 2019" ..' + - task: CMake@1 + inputs: + cmakeArgs: '--build . --config Release' + workingDirectory: 'build' + - script: | + cd build + cd utest + dir + openblas_utest.exe + + diff --git a/benchmark/Makefile b/benchmark/Makefile index 90d903ad7..2f70ceaf3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,6 +49,12 @@ else GOTO_LAPACK_TARGETS= endif +ifeq ($(BUILD_HALF),1) +GOTO_HALF_TARGETS=shgemm.goto +else +GOTO_HALF_TARGETS= +endif + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### +ifeq ($(BUILD_HALF),1) +shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1913,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX) ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ifeq ($(BUILD_HALF),1) +shgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + sgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/gemm.c b/benchmark/gemm.c index dd016a7c3..d2235330b 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef DOUBLE #define GEMM BLASFUNC(dgemm) +#elif defined(HALF) +#define GEMM BLASFUNC(shgemm) #else #define GEMM BLASFUNC(sgemm) #endif @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ int main(int argc, char *argv[]){ - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; char transa = 'N'; @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ k = to; } - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { + if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { + if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ #endif for (i = 0; i < m * k * COMPSIZE; i++) { - a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < k * n * COMPSIZE; i++) { - b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index ed9d4d2e8..136135c9c 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); - +#ifdef RETURN_BY_STACK + DOT (&result , &m, x, &inc_x, y, &inc_y ); +#else result = DOT (&m, x, &inc_x, y, &inc_y ); - +#endif gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; diff --git a/c_check b/c_check index c7899c84f..8234c2081 100644 --- a/c_check +++ b/c_check @@ -310,6 +310,7 @@ $linker_a = ""; && ($flags !~ /advapi32/) && ($flags !~ /shell32/) && ($flags !~ /omp/) + && ($flags !~ /[0-9]+/) ) { $linker_l .= $flags . 
" " } diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9d51f777c..d56ba99cb 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,7 +49,7 @@ if (DYNAMIC_ARCH) endif () if (POWER) - set(DYNAMIC_CORE POWER6 POWER8 POWER9) + set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) endif () if (X86) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index cc330ae2c..fc1f9bb22 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") endif () if (${F_COMPILER} STREQUAL "G77") diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 9b238f004..4b505a102 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,11 +113,31 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) +if (BUILD_HALF) + set(SHAMINKERNEL ../arm/amin.c) + set(SHAMAXKERNEL ../arm/amax.c) + set(SHMAXKERNEL ../arm/max.c) + set(SHMINKERNEL ../arm/min.c) + set(ISHAMAXKERNEL ../arm/iamax.c) + set(ISHAMINKERNEL ../arm/iamin.c) + set(ISHMAXKERNEL ../arm/imax.c) + set(ISHMINKERNEL ../arm/imin.c) + set(SHASUMKERNEL ../arm/asum.c) + set(SHAXPYKERNEL ../arm/axpy.c) + set(SHAXPBYKERNEL ../arm/axpby.c) + set(SHCOPYKERNEL ../arm/copy.c) + set(SHDOTKERNEL ../arm/dot.c) + set(SHROTKERNEL ../arm/rot.c) + set(SHSCALKERNEL ../arm/scal.c) + set(SHNRM2KERNEL ../arm/nrm2.c) + set(SHSUMKERNEL ../arm/sum.c) + set(SHSWAPKERNEL ../arm/swap.c) +endif () endmacro () macro(SetDefaultL2) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) + set(SGEMVNKERNEL ../arm/gemv_n.c) + set(SGEMVTKERNEL ../arm/gemv_t.c) set(DGEMVNKERNEL gemv_n.S) set(DGEMVTKERNEL gemv_t.S) set(CGEMVNKERNEL zgemv_n.S) @@ -161,6 +181,11 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +if (BUILD_HALF) + set(SHGEMVNKERNEL ../arm/gemv_n.c) + set(SHGEMVTKERNEL ../arm/gemv_t.c) + set(SHGERKERNEL ../generic/ger.c) +endif () endmacro () macro(SetDefaultL3) @@ -168,4 +193,18 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) +if (BUILD_HALF) + set(SHGEADD_KERNEL ../generic/geadd.c) + set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) + set(SHGEMM_BETA ../generic/gemm_beta.c) + set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) + set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) + set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) + set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) + set(SHGEMMINCOPYOBJ shgemm_incopy.o) + set(SHGEMMITCOPYOBJ shgemm_itcopy.o) + set(SHGEMMONCOPYOBJ shgemm_oncopy.o) + set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) +endif () + endmacro () diff --git a/cmake/os.cmake b/cmake/os.cmake index 2d25e7aaa..c644bc3f7 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () -if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") set(EXTRALIB "${EXTRALIB} -lm") set(NO_EXPRECISION 1) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 44e1473d1..30256870c 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -16,6 +16,8 @@ # HAVE_SSE2 # HAVE_SSE3 # MAKE +# SHGEMM_UNROLL_M +# SHGEMM_UNROLL_N # SGEMM_UNROLL_M # SGEMM_UNROLL_N # DGEMM_UNROLL_M @@ -418,7 +420,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} 
STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) - elseif ("${TCORE}" STREQUAL "POWER9") + elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" "#define L1_DATA_LINESIZE 128\n" @@ -437,6 +439,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) endif() + set(SHGEMM_UNROLL_M 8) + set(SHGEMM_UNROLL_N 4) # Or should this actually be NUM_CORES? if (${NUM_THREADS} GREATER 0) @@ -488,7 +492,7 @@ else(NOT CMAKE_CROSSCOMPILING) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -516,7 +520,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) diff --git a/cmake/system.cmake b/cmake/system.cmake index ce980a7b9..d8dcc3cf3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -297,6 +297,16 @@ if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED MAX_STACK_ALLOC) +if (NOT ${MAX_STACK_ALLOC} EQUAL 0) +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") +endif () +else () +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") +endif () +endif () + if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -407,6 +417,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") +if ("${F_COMPILER}" STREQUAL "FLANG") +if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") +endif () +endif () +endif () + if (NOT DEFINED SUFFIX) set(SUFFIX o) endif () @@ -530,6 +548,8 @@ endif () #export FUNCTION_PROFILE #export TARGET_CORE # +#export SHGEMM_UNROLL_M +#export SHGEMM_UNROLL_N #export SGEMM_UNROLL_M #export SGEMM_UNROLL_N #export DGEMM_UNROLL_M diff --git a/cmake/utils.cmake b/cmake/utils.cmake index fd93f8a70..1c21e776e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,12 +15,36 @@ endfunction () # Reads a Makefile into CMake vars. 
macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") + set (IfElse 0) + set (ElseSeen 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) +#message(STATUS "parsing ${makefile_line}") + if (${IfElse} GREATER 0) + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + continue () + endif () + string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + continue () + endif() + if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) +# message(STATUS "skipping ${makefile_line}") + continue () + endif () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) - set(var_value ${CMAKE_MATCH_2}) +# set(var_value ${CMAKE_MATCH_2}) + string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. $(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) foreach (make_var ${make_var_matches}) @@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN) else () string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on include ${line_match}") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + else () +# message(STATUS "unmatched line ${line_match}") + string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + else () + string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + endif () + endif () endif () endif () endforeach () @@ -163,6 +211,7 @@ function(GenerateNamedObjects sources_in) if (complex_only) list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "DOUBLE") + list(REMOVE_ITEM float_list "HALF") elseif (real_only) list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX") @@ -176,6 +225,9 @@ function(GenerateNamedObjects sources_in) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "sh") + endif () endif () if (NOT name_in) @@ -210,6 +262,9 @@ function(GenerateNamedObjects sources_in) if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () + if (${float_type} STREQUAL "HALF") + list(APPEND obj_defines "HALF") + endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") if (mangle_complex_sources) diff --git 
a/common.h b/common.h index 762968e6f..00b34a3f7 100644 --- a/common.h +++ b/common.h @@ -257,6 +257,11 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#define HALFCONVERSION 1 +#endif + #ifdef USE64BITINT typedef BLASLONG blasint; #if defined(OS_WINDOWS) && defined(__64BIT__) @@ -297,6 +302,13 @@ typedef int blasint; #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 +#elif defined(HALF) +#define IFLOAT bfloat16 +#define XFLOAT IFLOAT +#define FLOAT float +#define SIZE 2 +#define BASE_SHIFT 1 +#define ZBASE_SHIFT 2 #else #define FLOAT float #define SIZE 4 @@ -308,6 +320,10 @@ typedef int blasint; #define XFLOAT FLOAT #endif +#ifndef IFLOAT +#define IFLOAT FLOAT +#endif + #ifndef COMPLEX #define COMPSIZE 1 #else @@ -344,13 +360,8 @@ typedef int blasint; #endif #endif -#ifdef POWER8 -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif -#ifdef POWER9 +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif diff --git a/common_interface.h b/common_interface.h index c350ac8ec..78f5be6b0 100644 --- a/common_interface.h +++ b/common_interface.h @@ -469,6 +469,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ +void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *, diff --git a/common_level3.h b/common_level3.h index 6fa902be8..4e44a5e73 100644 --- a/common_level3.h +++ b/common_level3.h @@ -55,6 +55,8 @@ extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, @@ -76,6 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif +int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); @@ -499,6 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int 
shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -527,6 +534,11 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); + int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); @@ -619,6 +631,11 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif +int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); + int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 13bb85794..8fe1f156f 100644 --- a/common_macro.h +++ b/common_macro.h @@ -39,6 +39,7 @@ #ifndef COMMON_MACRO #define COMMON_MACRO +#include "common_sh.h" #include "common_s.h" #include "common_d.h" #include "common_q.h" @@ -642,6 +643,288 @@ #define IMATCOPY_K_RT DIMATCOPY_K_RT #define GEADD_K DGEADD_K + +#elif defined(HALF) + +#define AMAX_K SAMAX_K +#define AMIN_K SAMIN_K +#define MAX_K SMAX_K +#define MIN_K SMIN_K +#define IAMAX_K ISAMAX_K +#define IAMIN_K ISAMIN_K +#define IMAX_K ISMAX_K +#define IMIN_K ISMIN_K +#define ASUM_K SASUM_K +#define DOTU_K SDOTU_K +#define DOTC_K SDOTC_K +#define AXPYU_K SAXPYU_K +#define AXPYC_K SAXPYC_K +#define AXPBY_K SAXPBY_K +#define SCAL_K SSCAL_K +#define GEMV_N SGEMV_N +#define GEMV_T SGEMV_T +#define SYMV_U SSYMV_U +#define SYMV_L SSYMV_L +#define GERU_K SGERU_K +#define GERC_K SGERC_K +#define GERV_K SGERV_K +#define GERD_K SGERD_K +#define SUM_K SSUM_K +#define SWAP_K SSWAP_K +#define ROT_K SROT_K +#define COPY_K SCOPY_K +#define NRM2_K SNRM2_K +#define SYMV_THREAD_U SSYMV_THREAD_U +#define SYMV_THREAD_L SSYMV_THREAD_L +#define GEMM_BETA SHGEMM_BETA +#define GEMM_KERNEL_N SHGEMM_KERNEL +#define GEMM_KERNEL_L SHGEMM_KERNEL +#define GEMM_KERNEL_R SHGEMM_KERNEL +#define GEMM_KERNEL_B SHGEMM_KERNEL + +#define GEMM_NN SHGEMM_NN +#define GEMM_CN SHGEMM_TN +#define GEMM_TN SHGEMM_TN +#define GEMM_NC SHGEMM_NT +#define GEMM_NT SHGEMM_NT +#define GEMM_CC SHGEMM_TT +#define GEMM_CT SHGEMM_TT +#define GEMM_TC 
SHGEMM_TT +#define GEMM_TT SHGEMM_TT +#define GEMM_NR SHGEMM_NN +#define GEMM_TR SHGEMM_TN +#define GEMM_CR SHGEMM_TN +#define GEMM_RN SHGEMM_NN +#define GEMM_RT SHGEMM_NT +#define GEMM_RC SHGEMM_NT +#define GEMM_RR SHGEMM_NN +#define GEMM_ONCOPY SHGEMM_ONCOPY +#define GEMM_OTCOPY SHGEMM_OTCOPY +#define GEMM_INCOPY SHGEMM_INCOPY +#define GEMM_ITCOPY SHGEMM_ITCOPY +#define SYMM_THREAD_LU SSYMM_THREAD_LU +#define SYMM_THREAD_LL SSYMM_THREAD_LL +#define SYMM_THREAD_RU SSYMM_THREAD_RU +#define SYMM_THREAD_RL SSYMM_THREAD_RL +#define SYMM_LU SSYMM_LU +#define SYMM_LL SSYMM_LL +#define SYMM_RU SSYMM_RU +#define SYMM_RL SSYMM_RL + + +#define HEMM_THREAD_LU SHEMM_THREAD_LU +#define HEMM_THREAD_LL SHEMM_THREAD_LL +#define HEMM_THREAD_RU SHEMM_THREAD_RU +#define HEMM_THREAD_RL SHEMM_THREAD_RL + +#define GEMM_THREAD_NN SHGEMM_THREAD_NN +#define GEMM_THREAD_CN SHGEMM_THREAD_TN +#define GEMM_THREAD_TN SHGEMM_THREAD_TN +#define GEMM_THREAD_NC SHGEMM_THREAD_NT +#define GEMM_THREAD_NT SHGEMM_THREAD_NT +#define GEMM_THREAD_CC SHGEMM_THREAD_TT +#define GEMM_THREAD_CT SHGEMM_THREAD_TT +#define GEMM_THREAD_TC SHGEMM_THREAD_TT +#define GEMM_THREAD_TT SHGEMM_THREAD_TT +#define GEMM_THREAD_NR SHGEMM_THREAD_NN +#define GEMM_THREAD_TR SHGEMM_THREAD_TN +#define GEMM_THREAD_CR SHGEMM_THREAD_TN +#define GEMM_THREAD_RN SHGEMM_THREAD_NN +#define GEMM_THREAD_RT SHGEMM_THREAD_NT +#define GEMM_THREAD_RC SHGEMM_THREAD_NT +#define GEMM_THREAD_RR SHGEMM_THREAD_NN + +#ifdef UNIT + +#define TRMM_OUNCOPY STRMM_OUNUCOPY +#define TRMM_OUTCOPY STRMM_OUTUCOPY +#define TRMM_OLNCOPY STRMM_OLNUCOPY +#define TRMM_OLTCOPY STRMM_OLTUCOPY +#define TRSM_OUNCOPY STRSM_OUNUCOPY +#define TRSM_OUTCOPY STRSM_OUTUCOPY +#define TRSM_OLNCOPY STRSM_OLNUCOPY +#define TRSM_OLTCOPY STRSM_OLTUCOPY + +#define TRMM_IUNCOPY STRMM_IUNUCOPY +#define TRMM_IUTCOPY STRMM_IUTUCOPY +#define TRMM_ILNCOPY STRMM_ILNUCOPY +#define TRMM_ILTCOPY STRMM_ILTUCOPY +#define TRSM_IUNCOPY STRSM_IUNUCOPY +#define TRSM_IUTCOPY STRSM_IUTUCOPY +#define TRSM_ILNCOPY STRSM_ILNUCOPY +#define TRSM_ILTCOPY STRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY STRMM_OUNNCOPY +#define TRMM_OUTCOPY STRMM_OUTNCOPY +#define TRMM_OLNCOPY STRMM_OLNNCOPY +#define TRMM_OLTCOPY STRMM_OLTNCOPY +#define TRSM_OUNCOPY STRSM_OUNNCOPY +#define TRSM_OUTCOPY STRSM_OUTNCOPY +#define TRSM_OLNCOPY STRSM_OLNNCOPY +#define TRSM_OLTCOPY STRSM_OLTNCOPY + +#define TRMM_IUNCOPY STRMM_IUNNCOPY +#define TRMM_IUTCOPY STRMM_IUTNCOPY +#define TRMM_ILNCOPY STRMM_ILNNCOPY +#define TRMM_ILTCOPY STRMM_ILTNCOPY +#define TRSM_IUNCOPY STRSM_IUNNCOPY +#define TRSM_IUTCOPY STRSM_IUTNCOPY +#define TRSM_ILNCOPY STRSM_ILNNCOPY +#define TRSM_ILTCOPY STRSM_ILTNCOPY + +#define TRMM_KERNEL_LN STRMM_KERNEL_LN +#define TRMM_KERNEL_LT STRMM_KERNEL_LT +#define TRMM_KERNEL_LR STRMM_KERNEL_LN +#define TRMM_KERNEL_LC STRMM_KERNEL_LT +#define TRMM_KERNEL_RN STRMM_KERNEL_RN +#define TRMM_KERNEL_RT STRMM_KERNEL_RT +#define TRMM_KERNEL_RR STRMM_KERNEL_RN +#define TRMM_KERNEL_RC STRMM_KERNEL_RT + +#define TRSM_KERNEL_LN STRSM_KERNEL_LN +#define TRSM_KERNEL_LT STRSM_KERNEL_LT +#define TRSM_KERNEL_LR STRSM_KERNEL_LN +#define TRSM_KERNEL_LC STRSM_KERNEL_LT +#define TRSM_KERNEL_RN STRSM_KERNEL_RN +#define TRSM_KERNEL_RT STRSM_KERNEL_RT +#define TRSM_KERNEL_RR STRSM_KERNEL_RN +#define TRSM_KERNEL_RC STRSM_KERNEL_RT + +#define SYMM_IUTCOPY SSYMM_IUTCOPY +#define SYMM_ILTCOPY SSYMM_ILTCOPY +#define SYMM_OUTCOPY SSYMM_OUTCOPY +#define SYMM_OLTCOPY SSYMM_OLTCOPY +#define TRMM_LNUU STRMM_LNUU +#define TRMM_LNUN STRMM_LNUN +#define TRMM_LNLU STRMM_LNLU 
+#define TRMM_LNLN STRMM_LNLN +#define TRMM_LTUU STRMM_LTUU +#define TRMM_LTUN STRMM_LTUN +#define TRMM_LTLU STRMM_LTLU +#define TRMM_LTLN STRMM_LTLN +#define TRMM_LRUU STRMM_LNUU +#define TRMM_LRUN STRMM_LNUN +#define TRMM_LRLU STRMM_LNLU +#define TRMM_LRLN STRMM_LNLN +#define TRMM_LCUU STRMM_LTUU +#define TRMM_LCUN STRMM_LTUN +#define TRMM_LCLU STRMM_LTLU +#define TRMM_LCLN STRMM_LTLN +#define TRMM_RNUU STRMM_RNUU +#define TRMM_RNUN STRMM_RNUN +#define TRMM_RNLU STRMM_RNLU +#define TRMM_RNLN STRMM_RNLN +#define TRMM_RTUU STRMM_RTUU +#define TRMM_RTUN STRMM_RTUN +#define TRMM_RTLU STRMM_RTLU +#define TRMM_RTLN STRMM_RTLN +#define TRMM_RRUU STRMM_RNUU +#define TRMM_RRUN STRMM_RNUN +#define TRMM_RRLU STRMM_RNLU +#define TRMM_RRLN STRMM_RNLN +#define TRMM_RCUU STRMM_RTUU +#define TRMM_RCUN STRMM_RTUN +#define TRMM_RCLU STRMM_RTLU +#define TRMM_RCLN STRMM_RTLN + +#define TRSM_LNUU STRSM_LNUU +#define TRSM_LNUN STRSM_LNUN +#define TRSM_LNLU STRSM_LNLU +#define TRSM_LNLN STRSM_LNLN +#define TRSM_LTUU STRSM_LTUU +#define TRSM_LTUN STRSM_LTUN +#define TRSM_LTLU STRSM_LTLU +#define TRSM_LTLN STRSM_LTLN +#define TRSM_LRUU STRSM_LNUU +#define TRSM_LRUN STRSM_LNUN +#define TRSM_LRLU STRSM_LNLU +#define TRSM_LRLN STRSM_LNLN +#define TRSM_LCUU STRSM_LTUU +#define TRSM_LCUN STRSM_LTUN +#define TRSM_LCLU STRSM_LTLU +#define TRSM_LCLN STRSM_LTLN +#define TRSM_RNUU STRSM_RNUU +#define TRSM_RNUN STRSM_RNUN +#define TRSM_RNLU STRSM_RNLU +#define TRSM_RNLN STRSM_RNLN +#define TRSM_RTUU STRSM_RTUU +#define TRSM_RTUN STRSM_RTUN +#define TRSM_RTLU STRSM_RTLU +#define TRSM_RTLN STRSM_RTLN +#define TRSM_RRUU STRSM_RNUU +#define TRSM_RRUN STRSM_RNUN +#define TRSM_RRLU STRSM_RNLU +#define TRSM_RRLN STRSM_RNLN +#define TRSM_RCUU STRSM_RTUU +#define TRSM_RCUN STRSM_RTUN +#define TRSM_RCLU STRSM_RTLU +#define TRSM_RCLN STRSM_RTLN +#define SYRK_UN SSYRK_UN +#define SYRK_UT SSYRK_UT +#define SYRK_LN SSYRK_LN +#define SYRK_LT SSYRK_LT +#define SYRK_UR SSYRK_UN +#define SYRK_UC SSYRK_UT +#define SYRK_LR SSYRK_LN +#define SYRK_LC SSYRK_LT + +#define SYRK_KERNEL_U SSYRK_KERNEL_U +#define SYRK_KERNEL_L SSYRK_KERNEL_L + +#define HERK_UN SSYRK_UN +#define HERK_LN SSYRK_LN +#define HERK_UC SSYRK_UT +#define HERK_LC SSYRK_LT + +#define HER2K_UN SSYR2K_UN +#define HER2K_LN SSYR2K_LN +#define HER2K_UC SSYR2K_UT +#define HER2K_LC SSYR2K_LT + +#define SYR2K_UN SSYR2K_UN +#define SYR2K_UT SSYR2K_UT +#define SYR2K_LN SSYR2K_LN +#define SYR2K_LT SSYR2K_LT +#define SYR2K_UR SSYR2K_UN +#define SYR2K_UC SSYR2K_UT +#define SYR2K_LR SSYR2K_LN +#define SYR2K_LC SSYR2K_LT + +#define SYR2K_KERNEL_U SSYR2K_KERNEL_U +#define SYR2K_KERNEL_L SSYR2K_KERNEL_L +#define SYRK_THREAD_UN SSYRK_THREAD_UN +#define SYRK_THREAD_UT SSYRK_THREAD_UT +#define SYRK_THREAD_LN SSYRK_THREAD_LN +#define SYRK_THREAD_LT SSYRK_THREAD_LT +#define SYRK_THREAD_UR SSYRK_THREAD_UR +#define SYRK_THREAD_UC SSYRK_THREAD_UC +#define SYRK_THREAD_LR SSYRK_THREAD_LN +#define SYRK_THREAD_LC SSYRK_THREAD_LT + +#define HERK_THREAD_UN SSYRK_THREAD_UN +#define HERK_THREAD_UT SSYRK_THREAD_UT +#define HERK_THREAD_LN SSYRK_THREAD_LN +#define HERK_THREAD_LT SSYRK_THREAD_LT +#define HERK_THREAD_UR SSYRK_THREAD_UR +#define HERK_THREAD_UC SSYRK_THREAD_UC +#define HERK_THREAD_LR SSYRK_THREAD_LN +#define HERK_THREAD_LC SSYRK_THREAD_LT + +#define OMATCOPY_K_CN SOMATCOPY_K_CN +#define OMATCOPY_K_RN SOMATCOPY_K_RN +#define OMATCOPY_K_CT SOMATCOPY_K_CT +#define OMATCOPY_K_RT SOMATCOPY_K_RT +#define IMATCOPY_K_CN SIMATCOPY_K_CN +#define IMATCOPY_K_RN SIMATCOPY_K_RN +#define IMATCOPY_K_CT 
SIMATCOPY_K_CT +#define IMATCOPY_K_RT SIMATCOPY_K_RT + +#define GEADD_K SGEADD_K + +#endif + #else #define AMAX_K SAMAX_K @@ -673,14 +956,14 @@ #define GEMV_S SGEMV_S #define GEMV_D SGEMV_D + +#define SYMV_U SSYMV_U +#define SYMV_L SSYMV_L #define GERU_K SGERU_K #define GERC_K SGERC_K #define GERV_K SGERV_K #define GERD_K SGERD_K -#define SYMV_U SSYMV_U -#define SYMV_L SSYMV_L - #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L @@ -2202,6 +2485,9 @@ #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; +extern BLASLONG shgemm_p; +extern BLASLONG shgemm_q; +extern BLASLONG shgemm_r; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_mips.h b/common_mips.h index 2cc923043..7dc3ba246 100644 --- a/common_mips.h +++ b/common_mips.h @@ -43,6 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER +#if !defined(MIPS24K) static inline unsigned int rpcc(void){ unsigned long ret; @@ -53,6 +54,7 @@ static inline unsigned int rpcc(void){ return ret; } #define RPCC_DEFINED +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; @@ -92,7 +94,7 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) -#define BUFFER_SIZE (16 << 20) +#define BUFFER_SIZE (16 << 21) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_mips64.h b/common_mips64.h index af638d60c..a06edfe08 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -227,7 +227,7 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 32 << 21) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) diff --git a/common_param.h b/common_param.h index 574d5e176..c92609a76 100644 --- a/common_param.h +++ b/common_param.h @@ -47,6 +47,100 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; +#ifdef BUILD_HALF + int shgemm_p, shgemm_q, shgemm_r; + int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + + float (*shamax_k) (BLASLONG, float *, BLASLONG); + float (*shamin_k) (BLASLONG, float *, BLASLONG); + float (*shmax_k) (BLASLONG, float *, BLASLONG); + float (*shmin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); + + float (*shnrm2_k) (BLASLONG, float *, BLASLONG); + float (*shasum_k) (BLASLONG, float *, BLASLONG); + float (*shsum_k) (BLASLONG, float *, BLASLONG); + int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float 
*); + int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + + int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + + int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ilnucopy)(BLASLONG, 
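/* The shgemm_* table entries above give the half-precision GEMM its shape:
 * A and B are bfloat16 panels while alpha and C are float, so products are
 * accumulated in single precision. A minimal scalar sketch of that contract,
 * assuming bfloat16 is the top half of an IEEE-754 float; an illustration
 * only, not the optimized kernel behind shgemm_kernel: */
#include <stdint.h>
#include <string.h>

typedef uint16_t bf16_sk; /* assumed bfloat16 storage */

static float bf16_to_fp32(bf16_sk h) {
  uint32_t bits = (uint32_t)h << 16; /* widen: bf16 is a truncated float */
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}

/* C(m,n) += alpha * A(m,k) * B(k,n) on contiguous packed panels */
static void shgemm_sketch(long m, long n, long k, float alpha,
                          const bf16_sk *a, const bf16_sk *b,
                          float *c, long ldc) {
  long i, j, l;
  for (j = 0; j < n; j++)
    for (i = 0; i < m; i++) {
      float acc = 0.0f;
      for (l = 0; l < k; l++)
        acc += bf16_to_fp32(a[i + l * m]) * bf16_to_fp32(b[l + j * k]);
      c[i + j * ldc] += alpha * acc;
    }
}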
BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + +#endif int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; @@ -84,6 +178,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -907,6 +1002,15 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#ifdef BUILD_HALF +#define SHGEMM_P gotoblas -> shgemm_p +#define SHGEMM_Q gotoblas -> shgemm_q +#define SHGEMM_R gotoblas -> shgemm_r +#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m +#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n +#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#endif + #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -984,6 +1088,19 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif +#ifdef BUILD_HALF +#define SHGEMM_P SHGEMM_DEFAULT_P +#define SHGEMM_Q SHGEMM_DEFAULT_Q +#define SHGEMM_R SHGEMM_DEFAULT_R +#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#ifdef SHGEMM_DEFAULT_UNROLL_MN +#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#else +#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#endif +#endif + #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q #define SGEMM_R SGEMM_DEFAULT_R @@ -1119,6 +1236,18 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#elif defined(HALF) +#define GEMM_P SHGEMM_P 
+#define GEMM_Q SHGEMM_Q +#define GEMM_R SHGEMM_R +#define GEMM_UNROLL_M SHGEMM_UNROLL_M +#define GEMM_UNROLL_N SHGEMM_UNROLL_N +#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1204,28 +1333,32 @@ extern gotoblas_t *gotoblas; #define GEMM_THREAD gemm_thread_n #endif +#ifndef SHGEMM_DEFAULT_R +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#endif + #ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) #endif #ifndef SNUMOPT diff --git a/common_power.h b/common_power.h index e29d0f382..aa19794b5 100644 --- a/common_power.h +++ b/common_power.h @@ -68,7 +68,7 @@ #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define MB __asm__ __volatile__ 
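/* Worked example of the *_DEFAULT_R arithmetic above: the per-thread buffer
 * first holds the packed A panel (P*Q elements, padded up to the GEMM
 * alignment), and R is however many Q-element columns of packed B fit in the
 * remainder, rounded down to a multiple of 16 (the ~15UL mask). A sketch
 * with hypothetical sizes, not the library's actual tuning values: */
#include <stdio.h>

int main(void) {
  unsigned long buffer_size = 32UL << 20;        /* assumed 32 MiB buffer  */
  unsigned long p = 768, q = 384, esize = 4;     /* hypothetical SGEMM P/Q */
  unsigned long offset_a = 0, align = 0x03fffUL; /* hypothetical           */
  unsigned long a_bytes = (p * q * esize + offset_a + align) & ~align;
  unsigned long r = ((buffer_size - a_bytes) / (q * esize) - 15) & ~15UL;
  printf("R = %lu columns of packed B share the buffer with packed A\n", r);
  return 0;
}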
("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #define RMB __asm__ __volatile__ ("eieio":::"memory") @@ -272,7 +272,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -294,7 +294,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -843,7 +843,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) +#elif defined(POWER8) || defined(POWER9) || defined(POWER10) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/common_sh.h b/common_sh.h new file mode 100644 index 000000000..7a0045762 --- /dev/null +++ b/common_sh.h @@ -0,0 +1,65 @@ +#ifndef COMMON_SH_H +#define COMMON_SH_H + +#ifndef DYNAMIC_ARCH + +#define SHGEMM_ONCOPY shgemm_oncopy +#define SHGEMM_OTCOPY shgemm_otcopy + +#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N +#define SHGEMM_INCOPY shgemm_oncopy +#define SHGEMM_ITCOPY shgemm_otcopy +#else +#define SHGEMM_INCOPY shgemm_incopy +#define SHGEMM_ITCOPY shgemm_itcopy +#endif +#define SHGEMM_BETA shgemm_beta +#define SHGEMM_KERNEL shgemm_kernel + +#else + +#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy +#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy +#define SHGEMM_INCOPY gotoblas -> shgemm_incopy +#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy +#define SHGEMM_BETA gotoblas -> shgemm_beta +#define SHGEMM_KERNEL gotoblas -> shgemm_kernel + +#endif + +#define SHGEMM_NN shgemm_nn +#define SHGEMM_CN shgemm_tn +#define SHGEMM_TN shgemm_tn +#define SHGEMM_NC shgemm_nt +#define SHGEMM_NT shgemm_nt +#define SHGEMM_CC shgemm_tt +#define SHGEMM_CT shgemm_tt +#define SHGEMM_TC shgemm_tt +#define SHGEMM_TT shgemm_tt +#define SHGEMM_NR shgemm_nn +#define SHGEMM_TR shgemm_tn +#define SHGEMM_CR shgemm_tn +#define SHGEMM_RN shgemm_nn +#define SHGEMM_RT shgemm_nt +#define SHGEMM_RC shgemm_nt +#define SHGEMM_RR shgemm_nn + +#define SHGEMM_THREAD_NN shgemm_thread_nn +#define SHGEMM_THREAD_CN shgemm_thread_tn +#define SHGEMM_THREAD_TN shgemm_thread_tn +#define SHGEMM_THREAD_NC shgemm_thread_nt +#define SHGEMM_THREAD_NT shgemm_thread_nt +#define SHGEMM_THREAD_CC shgemm_thread_tt +#define SHGEMM_THREAD_CT shgemm_thread_tt +#define SHGEMM_THREAD_TC shgemm_thread_tt +#define SHGEMM_THREAD_TT shgemm_thread_tt +#define SHGEMM_THREAD_NR shgemm_thread_nn +#define SHGEMM_THREAD_TR shgemm_thread_tn +#define SHGEMM_THREAD_CR shgemm_thread_tn +#define SHGEMM_THREAD_RN shgemm_thread_nn +#define SHGEMM_THREAD_RT shgemm_thread_nt +#define SHGEMM_THREAD_RC shgemm_thread_nt +#define SHGEMM_THREAD_RR shgemm_thread_nn + +#endif + diff --git a/common_x86_64.h b/common_x86_64.h index 0247674cd..15d0c30aa 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ #endif do { - while (*address) {YIELDING;}; + while (*address) {YIELDING;} #ifndef C_MSVC __asm__ __volatile__( @@ -199,9 +199,9 @@ 
static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ #else extern unsigned int blas_quick_divide_table[]; -static __inline int blas_quickdivide(unsigned int x, unsigned int y){ +static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ - unsigned int result; + volatile unsigned int result; if (y <= 1) return x; @@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); - return result; } #endif diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h index 60ab5bb2f..8005369a8 100644 --- a/cpp_thread_test/cpp_thread_safety_common.h +++ b/cpp_thread_test/cpp_thread_safety_common.h @@ -5,6 +5,14 @@ inline void pauser(){ std::getline(std::cin, dummy); } +void FailIfThreadsAreZero(uint32_t numConcurrentThreads) { + if(numConcurrentThreads == 0) { + std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl; + exit(-1); + } +} + inline void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ for(uint32_t i=0; i<numMat; i++){ for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index 1c5287524..104c64f2a 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -46,6 +46,8 @@ int main(int argc, char* argv[]){ std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl; std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; + FailIfThreadsAreZero(numConcurrentThreads); + diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp if (argc > 4){ std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; exit(-1); } std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; + FailIfThreadsAreZero(numConcurrentThreads); + diff --git a/cpuid_power.c b/cpuid_power.c switch ( pvr >> 16 ) { + case 0x80: // POWER10 + return CPUTYPE_POWER10; + break; case 0x4e: // POWER9 return CPUTYPE_POWER9; break; diff --git a/cpuid_x86.c b/cpuid_x86.c index e29adecae..356800b78 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1406,6 +1406,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + } + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -1955,6 +1966,19 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + } case 5: switch (model) { case 6: diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 14c9d1944..8d301c239 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -12,6 +12,9 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) string(TOLOWER ${float_char_upper} float_char) + if (${float_char} STREQUAL "h") + continue() + endif() #level1 add_executable(x${float_char}cblat1 c_${float_char}blat1.f diff --git a/driver/level3/Makefile b/driver/level3/Makefile index e320092e3..09a62d9bf 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,6 +19,10 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq 
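/* Background for the Comet Lake entries in the cpuid_x86.c hunks above: the
 * identification comes from CPUID leaf 1, where family 6 with extended model
 * 10 and model 5/6 (display models 0xa5/0xa6) is Comet Lake, which is then
 * mapped onto the Haswell kernels when AVX2 is usable. Standalone sketch
 * using GCC's <cpuid.h>; x86 only: */
#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
  unsigned int family  = (eax >> 8) & 0xf;
  unsigned int model   = (eax >> 4) & 0xf;
  unsigned int exmodel = (eax >> 16) & 0xf;
  printf("family %u, exmodel %u, model %u\n", family, exmodel, model);
  return 0;
}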
($(BUILD_HALF),1) +SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +endif + SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -203,7 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 - +ifeq ($(BUILD_HALF),1) +SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) @@ -283,6 +289,18 @@ endif all :: +shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -478,6 +496,17 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) +shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2652,6 +2681,18 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) +shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2848,6 +2889,18 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) +shgemm_thread_nn.$(PSUFFIX) : 
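/* The shgemm build rules above reuse gemm.c and level3.c unchanged: -DHALF
 * switches the input element type while -DNN/-DNT/-DTN/-DTT pick the
 * transpose combination, e.g.
 *     cc -DHALF -DNN -c gemm.c -o shgemm_nn.o
 *     cc        -DNN -c gemm.c -o sgemm_nn.o
 * Simplified model of the type split (the real common.h covers more cases;
 * spelling bfloat16 as unsigned short is an assumption): */
#ifdef HALF
typedef unsigned short IFLOAT;  /* bfloat16 input storage */
#else
typedef float IFLOAT;           /* inputs match outputs otherwise */
#endif
typedef float FLOAT;            /* alpha/beta/C stay single precision here */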
gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 9aa67286f..c6bbb9ca9 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -62,18 +62,18 @@ #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif @@ -173,7 +173,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ BLASLONG k, lda, ldb, ldc; FLOAT *alpha, *beta; - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; BLASLONG m_from, m_to, n_from, n_to; BLASLONG ls, is, js; @@ -198,8 +199,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, k = K; - a = (FLOAT *)A; - b = (FLOAT *)B; + a = (IFLOAT *)A; + b = (IFLOAT *)B; c = (FLOAT *)C; lda = LDA; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ca0085e71..5a8d497d2 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -117,18 +117,18 @@ typedef struct { #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * 
COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif @@ -219,15 +219,16 @@ typedef struct { #define STOP_RPCC(COUNTER) #endif -static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ - FLOAT *buffer[DIVIDE_RATE]; + IFLOAT *buffer[DIVIDE_RATE]; BLASLONG k, lda, ldb, ldc; BLASLONG m_from, m_to, n_from, n_to; FLOAT *alpha, *beta; - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; job_t *job = (job_t *)args -> common; BLASLONG nthreads_m; @@ -255,8 +256,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, k = K; - a = (FLOAT *)A; - b = (FLOAT *)B; + a = (IFLOAT *)A; + b = (IFLOAT *)B; c = (FLOAT *)C; lda = LDA; @@ -425,7 +426,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Apply kernel with local region of A and part of other region of B */ START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, js); STOP_RPCC(kernel); @@ -469,7 +470,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Apply kernel with local region of A and part of region of B */ START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); @@ -532,7 +533,7 @@ static int round_up(int remainder, int width, int multiple) static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG - *range_n, FLOAT *sa, FLOAT *sb, + *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { #ifndef USE_OPENMP @@ -728,7 +729,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); return 0; } -int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ BLASLONG m = args -> m; BLASLONG n = args -> n; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index f13b83dd4..04b614a6e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -#if defined(OS_LINUX) && !defined(NO_AFFINITY) +#if defined(OS_LINUX) && !defined(NO_AFFINITY) int 
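/* What the ICOPY/OCOPY operations above delegate to, stripped of the
 * unroll-specific interleaving real copy kernels use: packing a sub-block of
 * the input matrix into a contiguous buffer, with the input type (IFLOAT,
 * bfloat16 under -DHALF) rather than FLOAT. A naive sketch: */
typedef float IFLOAT_sketch;  /* stand-in for IFLOAT in this illustration */

static void incopy_sketch(long m, long n, const IFLOAT_sketch *a, long lda,
                          IFLOAT_sketch *buffer) {
  long i, j;
  for (j = 0; j < n; j++)         /* walk the block column by column */
    for (i = 0; i < m; i++)
      *buffer++ = a[i + j * lda]; /* gather into one contiguous panel */
}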
gotoblas_set_affinity(int); int gotoblas_set_affinity2(int); int get_node(void); @@ -281,6 +281,8 @@ int get_node(void); static int increased_threads = 0; #ifdef OS_LINUX +extern int openblas_get_num_threads(void); + int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { const int active_threads = openblas_get_num_threads(); @@ -602,7 +604,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2e87e186a..c03b0b21d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,7 +332,7 @@ int support_avx512(){ if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } - if((ebx & (1<<31)) != 0){ + if((ebx & (1u<<31)) != 0){ xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + case 10: + if (model == 5 || model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: @@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){ cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( (eax & 0xffff) >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) return NULL; } else @@ -764,18 +776,53 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_ATOM) +#ifdef DYNAMIC_OLDER + return corename[ 6]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_PENRYN) +#ifdef DYNAMIC_OLDER + return corename[ 8]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_DUNNINGTON) +#ifdef DYNAMIC_OLDER + return corename[ 9]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_NEHALEM) return corename[10]; if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_OPTERON_SSE3) +#ifdef DYNAMIC_OLDER + return corename[12]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_OPTERON) +#ifdef DYNAMIC_OLDER + return corename[13]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == &gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_NANO) +#ifdef DYNAMIC_OLDER + return corename[15]; +#else + return 
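/* Context for the support_avx512() fix above: bit 31 of EBX from CPUID(7,0)
 * is tested, and 1<<31 overflows a signed int, hence the 1u<<31 spelling.
 * Full AVX-512 readiness also requires the OS to enable the opmask and ZMM
 * state in XCR0, which is what the (eax & 0xe0) == 0xe0 check reads via
 * xgetbv. Minimal sketch (GCC inline asm, x86 only): */
#include <stdint.h>

static uint64_t read_xcr0(void) {
  uint32_t eax, edx;
  __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
}

static int os_supports_avx512_state(void) {
  return (read_xcr0() & 0xe0) == 0xe0; /* opmask + ZMM_Hi256 + Hi16_ZMM */
}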
corename[10]; +#endif if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BOBCAT) +#ifdef DYNAMIC_OLDER + return corename[17]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; @@ -787,6 +834,7 @@ char *gotoblas_corename(void) { } + static gotoblas_t *force_coretype(char *coretype){ int i ; diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 8c831b998..811a5fae3 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,6 +6,9 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) +extern gotoblas_t gotoblas_POWER10; +#endif extern void openblas_warning(int verbose, const char *msg); @@ -13,7 +16,8 @@ static char *corename[] = { "unknown", "POWER6", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; #define NUM_CORETYPES 4 @@ -23,6 +27,9 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + if (gotoblas == &gotoblas_POWER10) return corename[4]; #endif return corename[0]; } @@ -36,6 +43,10 @@ static gotoblas_t *get_coretype(void) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + if (__builtin_cpu_is("isa_3_1") && __builtin_cpu_supports ("mma")) + return &gotoblas_POWER10; #endif return NULL; } @@ -61,6 +72,9 @@ static gotoblas_t *force_coretype(char * coretype) { case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + case 4: return (&gotoblas_POWER10); #endif default: return NULL; } diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 90d3051b1..403b34111 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,12 +1,58 @@ - #include "common.h" +#include <stdlib.h> +// Gate kernels for z13 and z14 on gcc version +#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ + /* RHEL 7 since 7.3: */ \ + (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ + __GNUC_RH_RELEASE__ >= 11) +#define HAVE_Z13_SUPPORT +#endif + +#if __GNUC__ >= 7 +#define HAVE_Z14_SUPPORT +#endif + +// Guard the use of getauxval() on glibc version >= 2.16 +#ifdef __GLIBC__ +#include <features.h> +#if __GLIBC_PREREQ(2, 16) +#include <sys/auxv.h> +#define HAVE_GETAUXVAL 1 + +static unsigned long get_hwcap(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + char *maskenv; + + // honor requests for not using specific CPU features in LD_HWCAP_MASK + maskenv = getenv("LD_HWCAP_MASK"); + if (maskenv) + hwcap &= strtoul(maskenv, NULL, 0); + + return hwcap; + // note that a missing auxval is interpreted as no capabilities + // available, which is safe. +} + +#else // __GLIBC_PREREQ(2, 16) +#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" + +static unsigned long get_hwcap(void) { + // treat missing support for getauxval() as no capabilities available, + // which is safe. 
+ return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +extern gotoblas_t gotoblas_ZARCH_GENERIC; +#ifdef HAVE_Z13_SUPPORT extern gotoblas_t gotoblas_Z13; +#endif +#ifdef HAVE_Z14_SUPPORT extern gotoblas_t gotoblas_Z14; -//extern gotoblas_t gotoblas_Z15; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -//extern gotoblas_t gotoblas_Z14; -//#endif +#endif #define NUM_CORETYPES 4 @@ -16,47 +62,50 @@ static char* corename[] = { "unknown", "Z13", "Z14", -// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { +#ifdef HAVE_Z13_SUPPORT if (gotoblas == &gotoblas_Z13) return corename[1]; +#endif +#ifdef HAVE_Z14_SUPPORT if (gotoblas == &gotoblas_Z14) return corename[2]; -// if (gotoblas == &gotoblas_Z15) return corename[3]; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// if (gotoblas == &gotoblas_POWER9) return corename[3]; -//#endif - return corename[0]; // try generic? +#endif + if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + + return corename[0]; } -// __builtin_cpu_is is not supported by zarch +/** + * Detect the fitting set of kernels by retrieving the CPU features supported by + * OS from the auxiliary value AT_HWCAP and choosing the set of kernels + * ("coretype") that exploits most of the features and can be compiled with the + * available gcc version. + * Note that we cannot use vector registers on a z13 or newer unless supported + * by the OS kernel (which needs to handle them properly during context switch). + */ static gotoblas_t* get_coretype(void) { - FILE* infile; - char buffer[512], * p; - p = (char*)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Type", buffer, 4)) { - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); + unsigned long hwcap __attribute__((unused)) = get_hwcap(); + + // z14 and z15 systems: exploit Vector Facility (SIMD) and + // Vector-Enhancements Facility 1 (float SIMD instructions), if present. 
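/* Usage sketch for the HWCAP-based selection below: the same detection in a
 * standalone program (Linux on Z with glibc >= 2.16 assumed; the fallback
 * defines carry the bit values from the kernel's s390 hwcap definitions): */
#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX  2048  /* Vector Facility (SIMD) */
#endif
#ifndef HWCAP_S390_VXE
#define HWCAP_S390_VXE 8192  /* Vector-Enhancements Facility 1 */
#endif

int main(void) {
  unsigned long hwcap = getauxval(AT_HWCAP);
  if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
    puts("z14 kernels usable");
  else if (hwcap & HWCAP_S390_VX)
    puts("z13 kernels usable");
  else
    puts("generic zarch kernels only");
  return 0;
}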
+#ifdef HAVE_Z14_SUPPORT + if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return &gotoblas_Z14; #endif - break; - } - } - fclose(infile); + // z13: Vector Facility (SIMD for double) +#ifdef HAVE_Z13_SUPPORT + if (hwcap & HWCAP_S390_VX) + return &gotoblas_Z13; +#endif - if (strstr(p, "2964")) return &gotoblas_Z13; - if (strstr(p, "2965")) return &gotoblas_Z13; - if (strstr(p, "3906")) return &gotoblas_Z14; - if (strstr(p, "3907")) return &gotoblas_Z14; - if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 - - return NULL; // should be ZARCH_GENERIC + // fallback in case of missing compiler support, systems before z13, or + // when the OS does not advertise support for the Vector Facility (e.g., + // missing support in the OS kernel) + return &gotoblas_ZARCH_GENERIC; } static gotoblas_t* force_coretype(char* coretype) { @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { switch (found) { +#ifdef HAVE_Z13_SUPPORT case 1: return (&gotoblas_Z13); +#endif +#ifdef HAVE_Z14_SUPPORT case 2: return (&gotoblas_Z14); -// case 3: return (&gotoblas_Z15); -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// case 3: return (&gotoblas_POWER9); -//#endif +#endif + case 3: return (&gotoblas_ZARCH_GENERIC); default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) { - snprintf(coremsg, 128, "Falling back to Z14 core\n"); + snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); openblas_warning(1, coremsg); - gotoblas = &gotoblas_Z14; + gotoblas = &gotoblas_ZARCH_GENERIC; } if (gotoblas && gotoblas->init) { diff --git a/driver/others/memory.c b/driver/others/memory.c index 5abcbf3a4..a5595aed4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2070,7 +2070,7 @@ if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; perror("OpenBLAS : munmap failed:"); - printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); + printf("error code=%d,\trelease->address=%p\n",errsv,release->address); } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 8bf7da78b..b1f3befae 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -62,6 +62,11 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; BLASLONG gemm_offset_b = GEMM_OFFSET_B; #endif +#if SHGEMM_P == shgemm_p +BLASLONG shgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG shgemm_p = SHGEMM_P; +#endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else @@ -83,6 +88,11 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; BLASLONG zgemm_p = ZGEMM_P; #endif +#if SHGEMM_Q == shgemm_q +BLASLONG shgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG shgemm_q = SHGEMM_Q; +#endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; #else @@ -104,6 +114,11 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; BLASLONG zgemm_q = ZGEMM_Q; #endif +#if SHGEMM_R == shgemm_r +BLASLONG shgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG shgemm_r = SHGEMM_R; +#endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; #else @@ -597,6 +612,7 @@ void blas_set_parameter(void){ size = BITMASK(cpuid3, 16, 0xff); + shgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); @@ -610,6 +626,7 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif + shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + 
GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/exports/Makefile b/exports/Makefile index 60291b1ff..01a313b35 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif +ifndef BUILD_HALF +BUILD_HALF = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS @@ -151,8 +155,12 @@ ifeq ($(F_COMPILER), INTEL) -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else - ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -234,23 +242,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. 
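/* Idea behind the linktest target above: gensymbol emits a C file that
 * references every symbol the library is supposed to export (now including
 * shgemm when BUILD_HALF=1), so simply linking it against the fresh build
 * proves nothing is missing. A hand-written miniature of the technique, not
 * the generated file: */
void sgemm_(void);
void dgemm_(void);
void shgemm_(void);  /* only present in BUILD_HALF=1 builds */

int linktest_sketch(void) {
  /* taking the addresses forces the linker to resolve each symbol */
  void *refs[3] = { (void *)sgemm_, (void *)dgemm_, (void *)shgemm_ };
  return refs[0] == refs[1]; /* distinct symbols, so this is 0 */
}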
rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index d2894e6c8..73b4be248 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -40,17 +40,13 @@ ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, xerbla, saxpby,daxpby,caxpby,zaxpby, + somatcopy, domatcopy, comatcopy, zomatcopy, + simatcopy, dimatcopy, cimatcopy, zimatcopy, sgeadd,dgeadd,cgeadd,zgeadd, - somatcopy, - simatcopy, - domatcopy, - dimatcopy, - comatcopy, - cimatcopy, - zomatcopy, - zimatcopy, + ssum, dsum, scsum, dzsum ); +@halfblasobjs = (shgemm); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -80,9 +76,16 @@ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, - cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd + cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, + cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, + cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, + cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, + cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, + cblas_xerbla ); +@halfcblasobjs = (cblas_shgemm); + @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, @@ -3454,6 +3457,10 @@ use File::Spec; use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); +if ($ARGV[12] == 1) { + @blasobjs = (@blasobjs, @halfblasobjs); + @cblasobjs = (@cblasobjs, @halfcblasobjs); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); @@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs); if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; - if ($ARGV[4] == 0) { @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); + if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; + if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; + if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; + if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; }else{ #NO_CBLAS=1 @no_underscore_objs = (@misc_no_underscore_objs); diff --git a/f_check b/f_check index fac8fc707..17d863224 100644 --- a/f_check +++ b/f_check @@ -334,7 +334,8 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) + && ($flags !~ /[0-9]+/) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . 
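/* How the ELF_VERSION=2 line added in the getarch.c hunk just below works:
 * getarch is a probe program the build compiles and runs first, turning
 * predefined compiler macros into Make variables on stdout. Equivalent
 * standalone probe: */
#include <stdio.h>

int main(void) {
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
  printf("ELF_VERSION=2\n"); /* ppc64 ELFv2 ABI, e.g. ppc64le */
#endif
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n");
#endif
  return 0;
}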
" "; diff --git a/getarch.c b/getarch.c index 145753bcc..164947f3e 100644 --- a/getarch.c +++ b/getarch.c @@ -650,6 +650,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER9" #endif +#if defined(FORCE_POWER10) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER10" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER10 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power10" +#define CORENAME "POWER10" +#endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" @@ -812,6 +825,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_MIPS1004K +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "MIPS1004K" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-DMIPS1004K " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "mips1004K" +#define CORENAME "MIPS1004K" +#else +#endif + +#ifdef FORCE_MIPS24K +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "MIPS24K" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-DMIPS24K " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "mips24K" +#define CORENAME "MIPS24K" +#else +#endif + #ifdef FORCE_I6500 #define FORCE #define ARCHITECTURE "MIPS" @@ -1334,10 +1375,12 @@ int main(int argc, char *argv[]){ #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); -#endif -#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 +#elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); #endif +#if defined(_CALL_ELF) && (_CALL_ELF == 2) +printf("ELF_VERSION=2\n"); +#endif #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 diff --git a/getarch_2nd.c b/getarch_2nd.c index cf9c578cb..a1d0ccac8 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -9,6 +9,8 @@ int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { + printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); + printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5ea39f864..7a8fc6698 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) if (USE_GEMM3M) - GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) + GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type}) endif() endif () if (${float_type} STREQUAL "COMPLEX") diff --git a/interface/Makefile b/interface/Makefile index 3f0dcca28..44a9fdcf0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,6 +46,9 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) +SHBLAS3OBJS = shgemm.$(SUFFIX) +endif DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ @@ -277,6 +280,10 @@ CSBLAS3OBJS = \ 
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) +CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +endif + CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ @@ -367,6 +374,7 @@ override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) +SHBLAS3OBJS += $(CSHBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) @@ -380,6 +388,7 @@ ZBLAS3OBJS += $(CZBLAS3OBJS) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +SHBLASOBJS = $(SHBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -454,7 +463,7 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -488,10 +497,10 @@ level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c @@ -1209,6 +1218,11 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) +shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) +endif + sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1770,6 +1784,11 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) +cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemm.c b/interface/gemm.c index 0b18d9a8c..99388e7d9 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -77,7 +77,7 @@ #define GEMM_MULTITHREAD_THRESHOLD 4 #endif -static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { #ifndef GEMM3M GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, @@ -108,8 +108,8 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA void NAME(char *TRANSA, char *TRANSB, blasint *M, blasint *N, blasint *K, FLOAT *alpha, - FLOAT *a, blasint *ldA, - FLOAT *b, blasint *ldB, + IFLOAT *a, blasint *ldA, + IFLOAT *b, blasint *ldB, FLOAT 
*beta, FLOAT *c, blasint *ldC){ @@ -119,8 +119,8 @@ void NAME(char *TRANSA, char *TRANSB, blasint info; char transA, transB; - FLOAT *buffer; - FLOAT *sa, *sb; + IFLOAT *buffer; + IFLOAT *sa, *sb; #ifdef SMP double MNK; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 35e0fff25..d1349c5f8 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -41,6 +41,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) if (DEFINED ${float_char}MAXKERNEL) @@ -93,6 +96,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -124,17 +130,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () - foreach (float_type SINGLE DOUBLE) + foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + if (NOT ${BUILD_HALF}) + continue () + else () + set (float_char "SH") + endif () + endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -470,9 +486,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + # Makefile.LA if(NOT NO_LAPACK) foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () if (NOT DEFINED ${float_char}NEG_TCOPY) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c) @@ -516,6 +536,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 
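/* Caller-side view of the prototype change above: shgemm takes bfloat16
 * (IFLOAT) input matrices while alpha, beta and C stay float. The extern
 * below is inferred from the NAME() signature in this hunk, with bfloat16
 * spelled as unsigned short and a 32-bit blasint assumed; treat it as a
 * sketch rather than the installed header: */
typedef unsigned short bfloat16_sk; /* assumed storage type */
typedef int blasint_sk;             /* assuming a 32-bit blasint build */

extern void shgemm_(char *transa, char *transb, blasint_sk *m, blasint_sk *n,
                    blasint_sk *k, float *alpha, bfloat16_sk *a,
                    blasint_sk *lda, bfloat16_sk *b, blasint_sk *ldb,
                    float *beta, float *c, blasint_sk *ldc);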
float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) endforeach () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 6d96abb2e..86772cb22 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -51,6 +51,10 @@ ifeq ($(CORE), POWER9) USE_TRMM = 1 endif +ifeq ($(CORE), POWER10) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif @@ -59,6 +63,25 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +ifeq ($(BUILD_HALF), 1) +ifndef SHGEMMKERNEL +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = ../generic/gemmkernel_2x2.c +SHGEMMINCOPY = ../generic/gemm_ncopy_2.c +SHGEMMITCOPY = ../generic/gemm_tcopy_2.c +SHGEMMONCOPY = ../generic/gemm_ncopy_2.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +SHKERNELOBJS += \ + shgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ + $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) +endif SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ @@ -93,6 +116,9 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) +ifeq ($(BUILD_HALF),1) +SHBLASOBJS += $(SHKERNELOBJS) +endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -100,6 +126,10 @@ CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) +ifeq ($(BUILD_HALF),1) +SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -389,6 +419,12 @@ ZBLASOBJS += \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_HALF), 1) +SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -415,6 +451,11 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_HALF),1) +$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -433,12 +474,47 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) + +$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) + +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s + m4 shgemmotcopy.s > shgemmotcopy_nomacros.s + $(CC) 
$(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ + rm shgemmotcopy.s shgemmotcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) + +$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s + m4 shgemmitcopy.s > shgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ + rm shgemmitcopy.s shgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + +endif +endif + $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -454,7 +530,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s @@ -466,7 +542,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s @@ -484,7 +560,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s @@ -527,7 +603,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s @@ -550,7 +626,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s @@ -582,7 +658,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s 
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s @@ -590,9 +666,22 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HALF), 1) + +$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s + m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif +endif + $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s @@ -605,7 +694,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s @@ -615,7 +704,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s @@ -625,7 +714,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -635,7 +724,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s @@ -645,7 +734,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s @@ -655,7 +744,7 @@ endif 
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s @@ -665,7 +754,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s @@ -675,7 +764,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s @@ -699,7 +788,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s @@ -709,7 +798,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s @@ -719,7 +808,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s @@ -729,7 +818,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -739,7 +828,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX 
-DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s @@ -749,7 +838,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s @@ -759,7 +848,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s @@ -769,7 +858,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s @@ -791,7 +880,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s @@ -801,7 +890,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s @@ -811,7 +900,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s @@ -821,7 
+910,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s @@ -831,7 +920,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s @@ -841,7 +930,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s @@ -851,7 +940,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s @@ -861,7 +950,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s @@ -871,7 +960,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s @@ -881,7 +970,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > 
ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s @@ -891,7 +980,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s @@ -901,7 +990,7 @@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s > ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s @@ -911,7 +1000,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s @@ -921,7 +1010,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s @@ -931,7 +1020,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s @@ -941,7 +1030,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s @@ -961,7 +1050,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) :
$(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -1095,7 +1184,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s @@ -2206,6 +2295,11 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_HALF),1) +$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2221,6 +2315,24 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) +$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +endif +endif + $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2325,6 +2437,12 @@ endif endif + +ifeq ($(BUILD_HALF), 1) +$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2342,7 +2460,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -2388,7 +2506,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE 
-UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(PFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index c1d33fa3e..eba38a92e 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -1,3 +1,187 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq
($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index b8d0af5c2..baf39150f 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add X, X, #128 .endm +/* + * No need to do software prefetches if the vector fits + * into L1 cache + */ +.macro KERNEL_F16_L1CACHE + ldp q4, q5, [X] + ldp q16, q17, [Y] + + ldp q6, q7, [X, #32] + ldp q18, q19, [Y, #32] + + fmla v16.2d, v4.2d, v0.d[0] + fmla v17.2d, v5.2d, v0.d[0] + + stp q16, q17, [Y] + + ldp q20, q21, [X, #64] + ldp q24, q25, [Y, #64] + + fmla v18.2d, v6.2d, v0.d[0] + fmla v19.2d, v7.2d, v0.d[0] + + stp q18, q19, [Y, #32] + + ldp q22, q23, [X, #96] + ldp q26, q27, [Y, #96] + + fmla v24.2d, v20.2d, v0.d[0] + fmla v25.2d, v21.2d, v0.d[0] + + stp q24, q25, [Y, #64] + + fmla v26.2d, v22.2d, v0.d[0] + fmla v27.2d, v23.2d, v0.d[0] + + stp q26, q27, [Y, #96] + + add Y, Y, #128 + add X, X, #128 +.endm + .macro KERNEL_F32 KERNEL_F16 KERNEL_F16 .endm + +.macro KERNEL_F32_L1CACHE + KERNEL_F16_L1CACHE + KERNEL_F16_L1CACHE +.endm + .macro INIT_S lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp I, xzr beq .Ldaxpy_kernel_F1 + cmp N, #2048 + ble .Ldaxpy_kernel_F32_L1CACHE + .align 5 .Ldaxpy_kernel_F32: @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subs I, I, #1 bne .Ldaxpy_kernel_F32 + b .Ldaxpy_kernel_F1 + + .align 5 +.Ldaxpy_kernel_F32_L1CACHE: + + KERNEL_F32_L1CACHE + + subs I, I, #1 + bne .Ldaxpy_kernel_F32_L1CACHE .Ldaxpy_kernel_F1: diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..628a928ca --- /dev/null +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -0,0 +1,2299 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 640 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 96 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + 
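+// The three macros that follow (KERNEL8x8_M1/M2/E, primed by KERNEL8x8_I
+// above) form a software-pipelined, double-buffered inner loop: M1 runs its
+// fmla chain on the A/B panel already held in v0,v1/v4,v5 while preloading
+// the next panel into v2,v3/v6,v7, and M2 does the same with the two
+// register banks swapped; E drains the pipeline. Rather than a full-width
+// "ldr q", each 128-bit vector is rebuilt from a 64-bit "ldr d" into the
+// low half plus a 64-bit "ldr x" into a spare GPR (x20-x27) that a later
+// "fmov v.d[1], x" moves into the high half, e.g.
+//
+//     ldr d2, [pA], #8     // low 64 bits of the next A panel
+//     ldr x22, [pA], #16   // high 64 bits, staged in a GPR
+//     ...
+//     fmov v2.d[1], x22    // upper half inserted one pipeline stage later
+//
+// a sequence intended to let the in-order Cortex-A53 dual-issue loads
+// alongside the fmla stream instead of stalling on 128-bit vector loads.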
+.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q2, q3, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + stp q2, q3, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q4, q5, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp 
q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + stp q6, q7, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q0, q1, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + stp q0, q1, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q2, q3, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + stp q2, q3, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q4, q5, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q6, q7, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + stp q6, q7, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x22, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + ldr x22, [pA], #8 + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + ldr x20, [pA], #8 + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr q2, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + str q2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q4, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + str q4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr q6, 
[pCRow1] + fmla v6.4s, v22.4s, alphaV2 + str q6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q0, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + str q0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr q2, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + str q2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q4, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + str q4, [pCRow2] + + ldr q6, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + str q6, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ldr d0, [pA], #8 + ldp q4, q5, [pB], #32 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr d2, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + str d2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d4, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + str d4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr d6, [pCRow1] + fmla v6.2s, v22.2s, alphaV2 + str d6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr d2, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + str d2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d4, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + str d4, [pCRow2] + + ldr d6, [pCRow1] + fmla v6.2s, v30.2s, alphaV2 + str d6, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ldp q4, q5, [pB], #32 + ldr s0, [pA], #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ldr s0, [pCRow0] + fmla s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s6, [pCRow1] + fmla s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s0, [pCRow2] + fmla s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s28, alphaV0 + str s4, [pCRow2] + + ldr s6, [pCRow1] + fmla s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL8x4_I + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v4.s[0] + ldr x22, 
[pA], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmul v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] +.endm + +.macro KERNEL8x4_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr x22, [pA], #16 + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #8 + fmla v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr x20, [pA], #16 + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v6.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] +.endm + +.macro KERNEL8x4_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v6.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] +.endm + +.macro KERNEL8x4_SUB + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] +.endm + +.macro SAVE8x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q0, q1, [pCRow2] + fmla v0.4s, v20.4s, alphaV0 + fmla v1.4s, v21.4s, alphaV1 + stp q0, q1, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] + fmla v4.4s, v22.4s, alphaV0 + fmla v5.4s, v23.4s, alphaV1 + stp q4, q5, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, wzr +.endm + +.macro KERNEL4x4_I + ldr q0, [pA], #16 + ldr q4, [pB], #16 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] +.endm + +.macro KERNEL4x4_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr x22, [pA], #8 + ldr x26, [pB], #8 + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] +.endm + +.macro KERNEL4x4_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr x20, [pA], #8 + 
ldr x24, [pB], #8 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] +.endm + +.macro KERNEL4x4_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] +.endm + +.macro KERNEL4x4_SUB + ldr q0, [pA], #16 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] +.endm + +.macro SAVE4x4 + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] + + add pCRow1, pCRow0, LDC + ldr q1, [pCRow1] + fmla v1.4s, v18.4s, alphaV2 + str q1, [pCRow1] + + add pCRow2, pCRow1, LDC + ldr q2, [pCRow2] + fmla v2.4s, v20.4s, alphaV0 + str q2, [pCRow2] + + add pCRow1, pCRow2, LDC + ldr q3, [pCRow1] + fmla v3.4s, v22.4s, alphaV2 + str q3, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 +.endm + +.macro KERNEL2x4_SUB + ldr d0, [pA], #8 + ldr q4, [pB], #16 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] +.endm + +.macro SAVE2x4 + ldr d8, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + str d8, [pCRow0] + + add pCRow1, pCRow0, LDC + ldr d12, [pCRow1] + fmla v12.2s, v18.2s, alphaV1 + str d12, [pCRow1] + + add pCRow2, pCRow1, LDC + ldr d8, [pCRow2] + fmla v8.2s, v20.2s, alphaV2 + str d8, [pCRow2] + + add pCRow1, pCRow2, LDC + ldr d12, [pCRow1] + fmla v12.2s, v22.2s, alphaV3 + str d12, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 +.endm + +.macro KERNEL8x2_SUB + ldp q0, q1, [pA], #32 + ldr d4, [pB], #8 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] +.endm + +.macro SAVE8x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + 
fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s4, [pB], #4 + ldp q0, q1, [pA], #32 + + fmla v16.4s, v0.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v1.4s, v4.s[0] +.endm + +.macro SAVE8x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lsgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, 
d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L8_M4_BEGIN + +.Lsgemm_kernel_L8_M8_20: + + mov pB, origPB + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? + blt .Lsgemm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M8_22a + .align 5 + +.Lsgemm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M8_22 + +.Lsgemm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_40: + + INIT8x8 + +.Lsgemm_kernel_L8_M8_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_M8_100 + +.Lsgemm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lsgemm_kernel_L8_M8_46 + +.Lsgemm_kernel_L8_M8_100: + + SAVE8x8 + +.Lsgemm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L8_END + + tst counterI, #4 + ble .Lsgemm_kernel_L8_M2_BEGIN + +.Lsgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
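+ // The unrolled loop below is software-pipelined: KERNEL4x8_I primes the + // first loads and multiplies, the M1/M2 pair forms the steady state and + // KERNEL4x8_E drains the operands still in flight, so this path is only + // taken when at least two unrolled rounds remain.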
+ blt .Lsgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M4_22a + .align 5 + +.Lsgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M4_22 + +.Lsgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_40: + + INIT4x8 + +.Lsgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L8_M4_100 + +.Lsgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lsgemm_kernel_L8_M4_100: + + SAVE4x8 + +.Lsgemm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L8_M1_BEGIN + +.Lsgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M2_40 + +.Lsgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_22 + + +.Lsgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M2_100 + +.Lsgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_42 + +.Lsgemm_kernel_L8_M2_100: + + SAVE2x8 + +.Lsgemm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L8_END + +.Lsgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M1_40 + +.Lsgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_22 + + +.Lsgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M1_100 + +.Lsgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_42 + +.Lsgemm_kernel_L8_M1_100: + + SAVE1x8 + +.Lsgemm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lsgemm_kernel_L999 + + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L4_M4_BEGIN + +.Lsgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
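+ // Same pipelining as the 4x8 case above; like the other I/M1/M2/E macros + // here, each 128-bit load is split into a d-register load plus an + // x-register load whose value is inserted into the high lane with fmov + // later, presumably so in-order cores such as the Cortex-A53 can + // dual-issue integer and NEON work.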
+ blt .Lsgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M8_22a + .align 5 + +.Lsgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M8_22 + +.Lsgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_40: + + INIT8x4 + +.Lsgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M8_100 + +.Lsgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lsgemm_kernel_L4_M8_100: + + SAVE8x4 + +.Lsgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L4_END + + tst counterI, #4 + ble .Lsgemm_kernel_L4_M2_BEGIN + +.Lsgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt .Lsgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M4_22a + .align 5 + +.Lsgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M4_22 + +.Lsgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lsgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M4_100 + +.Lsgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lsgemm_kernel_L4_M4_100: + + SAVE4x4 + +.Lsgemm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L4_M1_BEGIN + +.Lsgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M2_40 + +.Lsgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_22 + + +.Lsgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M2_100 + +.Lsgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_42 + +.Lsgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lsgemm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L4_END + +.Lsgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M1_40 + +.Lsgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_22 + + +.Lsgemm_kernel_L4_M1_40: + + ands 
counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M1_100 + +.Lsgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_42 + +.Lsgemm_kernel_L4_M1_100: + + SAVE1x4 + +.Lsgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lsgemm_kernel_L999 + + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lsgemm_kernel_L2_M4_BEGIN + +.Lsgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M8_40 + .align 5 + +.Lsgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_22 + + +.Lsgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M8_100 + +.Lsgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_42 + +.Lsgemm_kernel_L2_M8_100: + + SAVE8x2 + +.Lsgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L2_END + + tst counterI, #4 + ble .Lsgemm_kernel_L2_M2_BEGIN + +.Lsgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M4_40 + .align 5 + +.Lsgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_22 + + +.Lsgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M4_100 + +.Lsgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_42 + +.Lsgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lsgemm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L2_M1_BEGIN + +.Lsgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M2_40 + +.Lsgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M2_22 + + +.Lsgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M2_100 + +.Lsgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt 
.Lsgemm_kernel_L2_M2_42 + +.Lsgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lsgemm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L2_END + +.Lsgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lsgemm_kernel_L2_M1_40 + +.Lsgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_22 + + +.Lsgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M1_100 + +.Lsgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_42 + +.Lsgemm_kernel_L2_M1_100: + + SAVE1x2 + +.Lsgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lsgemm_kernel_L1_M4_BEGIN + +.Lsgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M8_40 + .align 5 + +.Lsgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_22 + + +.Lsgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M8_100 + +.Lsgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_42 + +.Lsgemm_kernel_L1_M8_100: + + SAVE8x1 + +.Lsgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L1_END + + tst counterI, #4 + ble .Lsgemm_kernel_L1_M2_BEGIN + +.Lsgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M4_40 + .align 5 + +.Lsgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_22 + + +.Lsgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M4_100 + +.Lsgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_42 + +.Lsgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lsgemm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + 
ble .Lsgemm_kernel_L1_M1_BEGIN + +.Lsgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M2_40 + +.Lsgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_22 + + +.Lsgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M2_100 + +.Lsgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_42 + +.Lsgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lsgemm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L1_END + +.Lsgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M1_40 + +.Lsgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_22 + + +.Lsgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M1_100 + +.Lsgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_42 + +.Lsgemm_kernel_L1_M1_100: + + SAVE1x1 + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S new file mode 100644 index 000000000..f99b1d992 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 +#define A05 x9 +#define A06 x10 +#define A07 x11 +#define A08 x12 + +#define I x13 +#define J x14 +#define K x15 + +#define TEMP1 x16 +#define TEMP2 x17 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x8 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v12.s[0], v0.s[2] + ins v14.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + ins v12.s[1], v1.s[2] + ins v14.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v12.s[2], v2.s[2] + ins v14.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + ins v12.s[3], v3.s[2] + ins v14.s[3], v3.s[3] + + ldr q4, [A05], #16 + ldr q5, [A06], #16 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v13.s[0], v4.s[2] + ins v15.s[0], v4.s[3] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + ins v13.s[1], v5.s[2] + ins v15.s[1], v5.s[3] + + ldr q6, [A07], #16 + ldr q7, [A08], #16 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v13.s[2], v6.s[2] + ins v15.s[2], v6.s[3] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + ins v13.s[3], v7.s[2] + ins v15.s[3], v7.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 +.endm + +.macro COPY2x8 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + + ldr d4, [A05], #8 + ldr d5, [A06], #8 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + + ldr d6, [A07], #8 + ldr d7, [A08], #8 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v9.s[3], v7.s[0] + ins v11.s[3], 
v7.s[1] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY1x8 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + ldr s4, [A05], #4 + ldr s5, [A06], #4 + ins v9.s[0], v4.s[0] + ins v9.s[1], v5.s[0] + + ldr s6, [A07], #4 + ldr s7, [A08], #4 + ins v9.s[2], v6.s[0] + ins v9.s[3], v7.s[0] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY4x4 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY2x4 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY1x4 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + st1 {v8.4s}, [B00], #16 +.endm + +.macro COPY4x2 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 +.endm + +.macro COPY2x2 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + st1 {v8.2s, v9.2s}, [B00], #16 +.endm + +.macro COPY1x2 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + st1 {v8.2s}, [B00], #8 +.endm + +.macro COPY1x1 + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Lsgemm_ncopy_L8_BEGIN: + + asr J, N, #3 // J = N / 8 + cmp J, #0 + ble .Lsgemm_ncopy_L4_BEGIN + + .align 5 +.Lsgemm_ncopy_L8_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A00, A08, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_40 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A01 + + .align 5 +.Lsgemm_ncopy_L8_warmup_1: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_1 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A02 + + .align 5 +.Lsgemm_ncopy_L8_warmup_2: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_2 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A03 + + .align 5 +.Lsgemm_ncopy_L8_warmup_3: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_3 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A04 + + .align 5
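+ // (each warm-up loop reads one float per 64-byte cache line, walking a + // whole column so the A panel is resident in L1 before the packing + // loops below run)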
+.Lsgemm_ncopy_L8_warmup_4: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_4 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A05 + + .align 5 +.Lsgemm_ncopy_L8_warmup_5: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_5 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A06 + + .align 5 +.Lsgemm_ncopy_L8_warmup_6: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_6 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A07 + + .align 5 +.Lsgemm_ncopy_L8_warmup_7: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_7 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A08 + + .align 5 +.Lsgemm_ncopy_L8_warmup_8: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_8 + + .align 5 +.Lsgemm_ncopy_L8_M4_20: + + COPY4x8 + + subs I, I, #1 + bne .Lsgemm_ncopy_L8_M4_20 + +.Lsgemm_ncopy_L8_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_60 + + COPY2x8 + +.Lsgemm_ncopy_L8_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_END + + COPY1x8 + +.Lsgemm_ncopy_L8_M4_END: + + subs J , J, #1 // j-- + bne .Lsgemm_ncopy_L8_M4_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L4_BEGIN: + + tst N, #7 + ble .Lsgemm_ncopy_L999 + + tst N, #4 + ble .Lsgemm_ncopy_L2_BEGIN + +.Lsgemm_ncopy_L4_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_40 + + .align 5 +.Lsgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I, I, #1 + bne .Lsgemm_ncopy_L4_M4_20 + +.Lsgemm_ncopy_L4_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_60 + + COPY2x4 + +.Lsgemm_ncopy_L4_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_END + + COPY1x4 + +.Lsgemm_ncopy_L4_M4_END: + + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L2_BEGIN: + + tst N, #2 + ble .Lsgemm_ncopy_L1_BEGIN + +.Lsgemm_ncopy_L2_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_40 + + .align 5 +.Lsgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Lsgemm_ncopy_L2_M4_20 + + +.Lsgemm_ncopy_L2_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_60 + + COPY2x2 + +.Lsgemm_ncopy_L2_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_END + + COPY1x2 + +.Lsgemm_ncopy_L2_M4_END: + +.Lsgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Lsgemm_ncopy_L999 + +.Lsgemm_ncopy_L1_M1_BEGIN: + + mov A01, A00 + + mov I, M + cmp I, #0 + ble .Lsgemm_ncopy_L1_M1_END + + .align 5 +.Lsgemm_ncopy_L1_M1_20: + + COPY1x1 + + subs I, I, #1 + bne .Lsgemm_ncopy_L1_M1_20 + +.Lsgemm_ncopy_L1_M1_END: + +.Lsgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_tcopy_8.S b/kernel/arm64/sgemm_tcopy_8.S new file mode 100644 index 000000000..7d81ba266 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_8.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x18 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] +
add TEMP1, TEMP1, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + add B02, B02, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B02] + add B02, B02, #16 + stp d6, d7, [B02] + add B02, B02, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + add A05, A05, #4 + add A06, A06, #4 + add A07, A07, #4 + add A08, A08, #4 + + stp s4, s5, [B03] + add B03, B03, #8 + stp s6, s7, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + + add B01, B01,
#64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + + add B02, B02, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B01] + add B01, B01, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B02] + add B02, B02, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B03] + + add B03, B03, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B00] + + add B00, B00, M8 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B01] + + add B01, B01, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B02] + + add B02, B02, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B03] + + add B03, B03, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-8 + and B02 , N , #-4 + and B03 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + + lsl M8, M, #5 // M8 = M * 8 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #256 // B = B + 8 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble 
.Lsgemm_tcopy_L8_M8_40 + + .align 5 +.Lsgemm_tcopy_L8_M8_20: + + COPY8x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M8_20 + +.Lsgemm_tcopy_L8_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L8_M8_60 + + COPY4x8 + +.Lsgemm_tcopy_L8_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M8_80 + + COPY2x8 + +.Lsgemm_tcopy_L8_M8_80: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M8_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M8_END: + + subs J, J, #1 // j-- + bne .Lsgemm_tcopy_L8_M8_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #128 // B = B + 4 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M8_40 + + .align 5 +.Lsgemm_tcopy_L4_M8_20: + + COPY8x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M8_20 + +.Lsgemm_tcopy_L4_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L4_M8_60 + + COPY4x4 + +.Lsgemm_tcopy_L4_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M8_80 + + COPY2x4 + +.Lsgemm_tcopy_L4_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L4_M8_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #64 // B = B + 2 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M8_40 + + .align 5 +.Lsgemm_tcopy_L2_M8_20: + + COPY8x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M8_20 + +.Lsgemm_tcopy_L2_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L2_M8_60 + + COPY4x2 + +.Lsgemm_tcopy_L2_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M8_80 + + COPY2x2 + +.Lsgemm_tcopy_L2_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M8_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #3 // I = M / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M8_40 + + .align 5 +.Lsgemm_tcopy_L1_M8_20: + + COPY8x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M8_20 + +.Lsgemm_tcopy_L1_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L1_M8_60 + + COPY4x1 + +.Lsgemm_tcopy_L1_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M8_80 + + COPY2x1 + +.Lsgemm_tcopy_L1_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M8_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M8_END: + +.Lsgemm_tcopy_L999: + + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/strmm_kernel_8x8_cortexa53.S b/kernel/arm64/strmm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..4b84623f3 --- /dev/null +++ b/kernel/arm64/strmm_kernel_8x8_cortexa53.S @@ -0,0 +1,2823 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* 
Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v0.4s, v1.4s}, [pA], #32 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, 
v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v0.4s}, [pA], #16 + ld1 {v4.4s, v5.4s}, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x21, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #16 + fmla v20.4s, v0.4s, v4.s[2] + ldr x27, [pB], #8 + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x21 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + ldr x24, [pB], #16 + fmla v20.4s, v2.4s, v6.s[2] + ldr x25, [pB], #8 + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x21 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, 
v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + + fmul v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + + fmul v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + + fmul v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + + fmul s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s28, alphaV0 + str s4, [pCRow2] + + + fmul s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 
{v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + + ldr d9, [pB], #8 + ldr d2, [pA], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmul v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmul v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v8.s[2] + fmul v25.4s, v1.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] + fmul v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M1 + ldr d9, [pB], #8 + fmov v8.d[1], x24 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + fmla v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmla v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmla v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d8, [pB], #8 + fmov v9.d[1], x25 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v9.s[0] + ldr x20, [pA], #16 + fmla v20.4s, v2.4s, v9.s[1] + ldr x21, [pA], #8 + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_E + fmov v9.d[1], x25 + fmov v2.d[1], x22 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + fmla v17.4s, v3.4s, v9.s[0] + fmla v20.4s, v2.4s, v9.s[1] + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] + + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] + + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] + + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + prfm PLDL1KEEP, [pB, 
#512] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] +.endm + +.macro SAVE2x4 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] +.endm + +.macro SAVE8x2 + add pCRow1, 
pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + 
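The macros above all realize the same register-blocked scheme: for an M-tile by N-tile block of C, the accumulators v16..v31 carry the running rank-1 updates acc += a_col * b_row for each k step, and the SAVE macros scale by alpha on the way out (this is a TRMM kernel, so C is overwritten with fmul rather than accumulated into with fmla). The KERNEL*_I/_M1/_M2/_E variants software-pipeline the k loop: each 128-bit operand is fetched as an ldr d / ldr x pair whose upper half is stitched in later with fmov v.d[1], so the loads for iteration k+1 overlap the FMAs of iteration k while the code ping-pongs between the two operand register sets. What follows is a minimal C sketch of the arithmetic the 8x8 case performs, not the tuned kernel; the function name, the acc layout, and the row/column orientation of C are illustrative assumptions, only the packed 8-floats-per-k streams and the alpha-scaled store are taken from the macros above.

    #include <stddef.h>

    /* Illustrative model of INIT8x8 + KERNEL8x8_SUB + SAVE8x8:
     * one 8x8 tile of C = alpha * A * B, with pA and pB each
     * delivering 8 packed floats per k step, as in the kernel. */
    static void tile8x8(size_t K, float alpha,
                        const float *pA,   /* K * 8 packed A values */
                        const float *pB,   /* K * 8 packed B values */
                        float *C, size_t ldc)
    {
        float acc[8][8] = {{0.0f}};        /* plays the role of v16..v31 */

        for (size_t k = 0; k < K; k++)     /* one KERNEL8x8_SUB per k */
            for (size_t j = 0; j < 8; j++)       /* v4.s[j] / v5.s[j] */
                for (size_t i = 0; i < 8; i++)   /* v0/v1 lanes       */
                    acc[j][i] += pA[8*k + i] * pB[8*k + j];

        for (size_t j = 0; j < 8; j++)     /* SAVE8x8: advance by ldc,  */
            for (size_t i = 0; i < 8; i++) /* scale by alpha, overwrite */
                C[j*ldc + i] = alpha * acc[j][i];
    }

The _I/_M1/_M2/_E split then unrolls this k loop so that the pure-arithmetic body (_M1/_M2) never stalls on a load, with _I priming the first operands and _E draining the last ones.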
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lstrmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L8_M4_BEGIN + +.Lstrmm_kernel_L8_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? 
+ blt .Lstrmm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M8_22a + .align 5 + +.Lstrmm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M8_22 + +.Lstrmm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_40: + + INIT8x8 + +.Lstrmm_kernel_L8_M8_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_M8_100 + +.Lstrmm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lstrmm_kernel_L8_M8_46 + +.Lstrmm_kernel_L8_M8_100: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +.Lstrmm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L8_END + + tst counterI, #4 + ble .Lstrmm_kernel_L8_M2_BEGIN + +.Lstrmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M4_22a + .align 5 + +.Lstrmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M4_22 + +.Lstrmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_40: + + INIT4x8 + +.Lstrmm_kernel_L8_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L8_M4_100 + +.Lstrmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lstrmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lstrmm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L8_M1_BEGIN + +.Lstrmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L8_M2_40 + +.Lstrmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_22 + + +.Lstrmm_kernel_L8_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M2_100 + +.Lstrmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_42 + +.Lstrmm_kernel_L8_M2_100: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L8_END + +.Lstrmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 
+ ble .Lstrmm_kernel_L8_M1_40 + +.Lstrmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_22 + + +.Lstrmm_kernel_L8_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M1_100 + +.Lstrmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_42 + +.Lstrmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +.Lstrmm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lstrmm_kernel_L999 + + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L4_M4_BEGIN + +.Lstrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M8_22a + .align 5 + +.Lstrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M8_22 + +.Lstrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_40: + + INIT8x4 + +.Lstrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M8_100 + +.Lstrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lstrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L4_END + + tst counterI, #4 + ble .Lstrmm_kernel_L4_M2_BEGIN + +.Lstrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M4_22a + .align 5 + +.Lstrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M4_22 + +.Lstrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_40: + + INIT4x4 + +.Lstrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M4_100 + +.Lstrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lstrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L4_M1_BEGIN + +.Lstrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M2_40 + +.Lstrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_22 + + +.Lstrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M2_100 + +.Lstrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_42 + +.Lstrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L4_END + +.Lstrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M1_40 
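The same tempK/tempOffset bookkeeping recurs before and after every tile in these blocks: because one operand is triangular, an m_tile x n_tile block of C only consumes part of the K range, selected at compile time by the LEFT/TRANSA switches, and the immediates #8/#4/#2/#1 added to tempOffset are simply the tile sizes. A sketch of the inner-length selection, with illustrative names (the assembly encodes the same three cases in its #if chains):

    /* Mirror of the tempK computation repeated above; all names
     * are illustrative, the branch structure follows the #if blocks. */
    static long effective_k(long origK, long tempOffset,
                            long m_tile, long n_tile,
                            int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return origK - tempOffset;   /* sub tempK, origK, tempOffset */
        else if (left)
            return tempOffset + m_tile;  /* add tempK, tempOffset, #m    */
        else
            return tempOffset + n_tile;  /* add tempK, tempOffset, #n    */
    }

After each SAVE, the matching #if block advances pA and pB past the portion of K this tile skipped (the lsl shift counts encode tile_size * sizeof(float)), and tempOffset is bumped by the tile size for the next block down the M direction.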
+ +.Lstrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_22 + + +.Lstrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M1_100 + +.Lstrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_42 + +.Lstrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lstrmm_kernel_L999 + + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lstrmm_kernel_L2_M4_BEGIN + +.Lstrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M8_40 + .align 5 + +.Lstrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_22 + + +.Lstrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M8_100 + +.Lstrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_42 + +.Lstrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L2_END + + tst counterI, #4 + ble .Lstrmm_kernel_L2_M2_BEGIN + +.Lstrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) 
&& !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M4_40 + .align 5 + +.Lstrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_22 + + +.Lstrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M4_100 + +.Lstrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_42 + +.Lstrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L2_M1_BEGIN + +.Lstrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M2_40 + +.Lstrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_22 + + +.Lstrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M2_100 + +.Lstrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_42 + +.Lstrmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L2_END + +.Lstrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lstrmm_kernel_L2_M1_40 + +.Lstrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_22 + + +.Lstrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M1_100 + +.Lstrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_42 + +.Lstrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lstrmm_kernel_L1_M4_BEGIN + +.Lstrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M8_40 + .align 5 + +.Lstrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_22 + + +.Lstrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M8_100 + +.Lstrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_42 + +.Lstrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble 
.Lstrmm_kernel_L1_END + + tst counterI, #4 + ble .Lstrmm_kernel_L1_M2_BEGIN + +.Lstrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M4_40 + .align 5 + +.Lstrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_22 + + +.Lstrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M4_100 + +.Lstrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_42 + +.Lstrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L1_M1_BEGIN + +.Lstrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M2_40 + +.Lstrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_22 + + +.Lstrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M2_100 + +.Lstrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_42 + +.Lstrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L1_END + +.Lstrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + 
mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M1_40 + +.Lstrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_22 + + +.Lstrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M1_100 + +.Lstrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_42 + +.Lstrmm_kernel_L1_M1_100: + + SAVE1x1 + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/common_param.h b/kernel/common_param.h deleted file mode 100644 index eab14b0a6..000000000 --- a/kernel/common_param.h +++ /dev/null @@ -1,1274 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#ifndef COMMON_PARAM_H -#define COMMON_PARAM_H - -#ifndef ASSEMBLER - -#ifdef DYNAMIC_ARCH - -typedef struct { - int dtb_entries; - int offsetA, offsetB, align; - - int sgemm_p, sgemm_q, sgemm_r; - int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; - - int exclusive_cache; - - float (*samax_k) (BLASLONG, float *, BLASLONG); - float (*samin_k) (BLASLONG, float *, BLASLONG); - float (*smax_k) (BLASLONG, float *, BLASLONG); - float (*smin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - - float (*snrm2_k) (BLASLONG, float *, BLASLONG); - float (*sasum_k) (BLASLONG, float *, BLASLONG); - float (*ssum_k) (BLASLONG, float *, BLASLONG); - int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); - int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, 
BLASLONG, BLASLONG, float *); - int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int dgemm_p, dgemm_q, dgemm_r; - int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; - - double (*damax_k) (BLASLONG, double *, BLASLONG); - double (*damin_k) (BLASLONG, double *, BLASLONG); - double (*dmax_k) (BLASLONG, 
double *, BLASLONG); - double (*dmin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); - - double (*dnrm2_k) (BLASLONG, double *, BLASLONG); - double (*dasum_k) (BLASLONG, double *, BLASLONG); - double (*dsum_k) (BLASLONG, double *, BLASLONG); - int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); - int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - 
int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int qgemm_p, qgemm_q, qgemm_r; - int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; - - xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); - - xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); - 
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, 
BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - int cgemm_p, cgemm_q, cgemm_r; - int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - - float (*camax_k) (BLASLONG, float *, BLASLONG); - float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); - - float (*cnrm2_k) (BLASLONG, float *, BLASLONG); - float (*casum_k) (BLASLONG, float *, BLASLONG); - float (*csum_k) (BLASLONG, float *, BLASLONG); - int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, 
float, float); - - int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_otcopy 
)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int 
(*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int cgemm3m_p, cgemm3m_q, cgemm3m_r; - int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; - - int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - - int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - - int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, 
float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int zgemm_p, zgemm_q, zgemm_r; - int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; - - double (*zamax_k) (BLASLONG, double *, BLASLONG); - double (*zamin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); - - double (*znrm2_k) (BLASLONG, double *, BLASLONG); - double (*zasum_k) (BLASLONG, double *, BLASLONG); - double (*zsum_k) (BLASLONG, double *, BLASLONG); - int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, 
double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int 
(*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltucopy)(BLASLONG, 
BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int zgemm3m_p, zgemm3m_q, zgemm3m_r; - int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; - - int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - - int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - - int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, 
BLASLONG, double, double, double *); - int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int xgemm_p, xgemm_q, xgemm_r; - int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; - - xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); - - xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_t) (BLASLONG, 
BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, 
xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, 
BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int xgemm3m_p, xgemm3m_q, xgemm3m_r; - int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; - - int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - - int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int 
(*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
- int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
-
- int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
-
- int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
- int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
- int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
- int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
- int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
- int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
-
- int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
- int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *);
-
- int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
- int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *);
-
-#endif
-
-
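The `#endif` just above closes the second EXPRECISION block: the `x*` (extended-precision complex) kernel slots, like the `q*` slots earlier, are compiled into the table only when the library is built with EXPRECISION, so the very layout of the structure depends on that flag. A minimal sketch of the pattern, assuming invented `*_demo` names (this is not OpenBLAS code):

    /* Feature-gated members: the slot exists only when the feature
       macro is defined, so sizeof() changes with the build flags. */
    #include <stdio.h>

    typedef long double xdouble_demo;  /* stand-in for OpenBLAS's xdouble */

    typedef struct {
      double (*dnrm2)(long n, double *x, long incx);
    #ifdef EXPRECISION_DEMO
      xdouble_demo (*qnrm2)(long n, xdouble_demo *x, long incx);
    #endif
    } kernel_table_demo;

    int main(void) {
      /* Compile once with and once without -DEXPRECISION_DEMO. */
      printf("table size: %zu bytes\n", sizeof(kernel_table_demo));
      return 0;
    }

Because the member offsets shift, every translation unit that touches the table must agree on the flag, which is why EXPRECISION is a global build option rather than a per-file one.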
- void (*init)(void);
-
- int snum_opt, dnum_opt, qnum_opt;
-
- int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG);
- int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG);
- int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG);
- int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG);
-
- int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
- int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
- int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
- int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
-
- int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
- int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
- int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
- int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
-
- int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
- int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
- int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
- int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
-
- int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
- int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
- int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
- int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
-
- int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
- int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
- int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
- int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
-
- int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
- int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
- int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
- int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
-
- int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
- int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG);
- int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
- int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG);
-
- int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
- int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG);
- int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
- int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG);
-
- int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
- int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
- int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
- int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
-
- int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
- int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
- int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
- int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
-
- int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
- int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
- int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
- int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
-
- int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
- int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
- int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
- int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
-
- int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
- int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
- int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
- int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG);
-
-} gotoblas_t;
-
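The members directly above the closing brace are the slots for the BLAS-extension routines: `?axpby_k` (y := alpha*x + beta*y), `?omatcopy_k_*` (out-of-place scaled copy/transpose, B := alpha*op(A)), `?imatcopy_k_*` (the in-place variant), and `?geadd_k` (scaled matrix addition). The `_cn/_ct/_rn/_rt` suffixes encode column- or row-major storage crossed with a non-transposed or transposed op, and the extra trailing `c` (`_cnc`, `_rtc`, ...) marks the conjugating variants for complex data. A rough illustration, with invented names, of what a `ct`-style omatcopy slot computes (not the actual kernel):

    /* B := alpha * A^T, both column-major: A is m x n (lda >= m),
       B is n x m (ldb >= n). */
    #include <stdio.h>

    static void somatcopy_ct_demo(long m, long n, float alpha,
                                  const float *a, long lda,
                                  float *b, long ldb) {
      for (long j = 0; j < n; j++)
        for (long i = 0; i < m; i++)
          b[j + i * ldb] = alpha * a[i + j * lda];
    }

    int main(void) {
      float a[4] = {1, 2, 3, 4};  /* 2x2, columns (1,2) and (3,4) */
      float b[4];
      somatcopy_ct_demo(2, 2, 2.0f, a, 2, b, 2);
      printf("%g %g %g %g\n", b[0], b[1], b[2], b[3]);  /* 2 6 4 8 */
      return 0;
    }

Routing these through the table, like the core kernels, lets DYNAMIC_ARCH builds substitute vectorized versions per CPU target.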
-extern gotoblas_t *gotoblas;
-
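At this point the table type is complete: under DYNAMIC_ARCH several `gotoblas_t` instances (one per supported CPU core type) are linked into the library, and `gotoblas` is pointed at the matching one during initialization, after CPU detection. The macro block that follows rebinds the familiar parameter names to fields of that table, so generic code compiles unchanged in both static and dynamic builds. A minimal sketch of the mechanism with invented `*_demo` names (the real CPU detection and table setup live elsewhere in OpenBLAS):

    #include <stdio.h>

    typedef struct {
      int sgemm_p;                            /* blocking parameter */
      void (*sgemm_kernel)(const char *who);  /* arch-specific kernel */
    } gotoblas_demo_t;

    static void kernel_generic(const char *w) { printf("generic %s\n", w); }
    static void kernel_avx2(const char *w)    { printf("avx2 %s\n", w); }

    static gotoblas_demo_t table_generic = { 256, kernel_generic };
    static gotoblas_demo_t table_avx2    = { 384, kernel_avx2 };

    gotoblas_demo_t *gotoblas_demo;  /* chosen once, early */

    #define SGEMM_P_DEMO (gotoblas_demo->sgemm_p)

    int main(void) {
      int have_avx2 = 1;  /* stand-in for real CPUID-based detection */
      gotoblas_demo = have_avx2 ? &table_avx2 : &table_generic;
      printf("SGEMM_P = %d\n", SGEMM_P_DEMO);
      gotoblas_demo->sgemm_kernel("sgemm");
      return 0;
    }

The cost of the indirection is one pointer load per parameter read or kernel call, which is why a single global table pointer suffices to support many CPU targets in one binary.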
SGEMM_DEFAULT_UNROLL_MN -#else -#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) -#endif - -#define DGEMM_P DGEMM_DEFAULT_P -#define DGEMM_Q DGEMM_DEFAULT_Q -#define DGEMM_R DGEMM_DEFAULT_R -#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#ifdef DGEMM_DEFAULT_UNROLL_MN -#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN -#else -#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) -#endif - -#define QGEMM_P QGEMM_DEFAULT_P -#define QGEMM_Q QGEMM_DEFAULT_Q -#define QGEMM_R QGEMM_DEFAULT_R -#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - -#define CGEMM_P CGEMM_DEFAULT_P -#define CGEMM_Q CGEMM_DEFAULT_Q -#define CGEMM_R CGEMM_DEFAULT_R -#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#ifdef CGEMM_DEFAULT_UNROLL_MN -#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN -#else -#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) -#endif - -#define ZGEMM_P ZGEMM_DEFAULT_P -#define ZGEMM_Q ZGEMM_DEFAULT_Q -#define ZGEMM_R ZGEMM_DEFAULT_R -#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#ifdef ZGEMM_DEFAULT_UNROLL_MN -#define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN -#else -#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) -#endif - -#define XGEMM_P XGEMM_DEFAULT_P -#define XGEMM_Q XGEMM_DEFAULT_Q -#define XGEMM_R XGEMM_DEFAULT_R -#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) - -#ifdef CGEMM3M_DEFAULT_UNROLL_N - -#define CGEMM3M_P CGEMM3M_DEFAULT_P -#define CGEMM3M_Q CGEMM3M_DEFAULT_Q -#define CGEMM3M_R CGEMM3M_DEFAULT_R -#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) - -#else - -#define CGEMM3M_P SGEMM_DEFAULT_P -#define CGEMM3M_Q SGEMM_DEFAULT_Q -#define CGEMM3M_R SGEMM_DEFAULT_R -#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) - -#endif - - -#ifdef ZGEMM3M_DEFAULT_UNROLL_N - -#define ZGEMM3M_P ZGEMM3M_DEFAULT_P -#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q -#define ZGEMM3M_R ZGEMM3M_DEFAULT_R -#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#else - -#define ZGEMM3M_P DGEMM_DEFAULT_P -#define ZGEMM3M_Q DGEMM_DEFAULT_Q -#define ZGEMM3M_R DGEMM_DEFAULT_R -#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#endif - -#define XGEMM3M_P QGEMM_DEFAULT_P -#define XGEMM3M_Q QGEMM_DEFAULT_Q -#define XGEMM3M_R QGEMM_DEFAULT_R -#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - - -#endif -#endif - -#ifndef COMPLEX -#if defined(XDOUBLE) -#define GEMM_P QGEMM_P -#define GEMM_Q QGEMM_Q -#define GEMM_R QGEMM_R -#define GEMM_UNROLL_M QGEMM_UNROLL_M -#define GEMM_UNROLL_N QGEMM_UNROLL_N -#define GEMM_UNROLL_MN QGEMM_UNROLL_MN -#define GEMM_DEFAULT_P QGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R QGEMM_DEFAULT_R 
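
The long run of `gotoblas ->` macros and `*_DEFAULT_*` fallbacks above is OpenBLAS's two-mode parameter lookup: with DYNAMIC_ARCH, GEMM blocking factors and kernel entry points are read at run time through the global `gotoblas` dispatch table selected by CPU detection, while static builds collapse the same macro names to compile-time defaults. A minimal sketch of that pattern, with illustrative names (`dispatch_t` and `current` are not the real OpenBLAS identifiers):

    /* Sketch of the two-mode lookup above; dispatch_t and current
     * are illustrative names, not OpenBLAS's own. */
    #ifndef SGEMM_DEFAULT_P
    #define SGEMM_DEFAULT_P 128          /* placeholder compile-time default */
    #endif

    typedef struct {
        int sgemm_p, sgemm_q, sgemm_r;   /* per-CPU blocking parameters */
    } dispatch_t;

    extern dispatch_t *current;          /* chosen once by CPU detection */

    #ifdef DYNAMIC_ARCH
    #define SGEMM_P (current->sgemm_p)   /* one pointer indirection per use */
    #else
    #define SGEMM_P SGEMM_DEFAULT_P      /* constant-folded by the compiler */
    #endif

Either way, kernel code only ever names SGEMM_P; the cost of runtime dispatch is a single load through the table instead of an immediate constant.
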
-#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P DGEMM_P -#define GEMM_Q DGEMM_Q -#define GEMM_R DGEMM_R -#define GEMM_UNROLL_M DGEMM_UNROLL_M -#define GEMM_UNROLL_N DGEMM_UNROLL_N -#define GEMM_UNROLL_MN DGEMM_UNROLL_MN -#define GEMM_DEFAULT_P DGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R DGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P SGEMM_P -#define GEMM_Q SGEMM_Q -#define GEMM_R SGEMM_R -#define GEMM_UNROLL_M SGEMM_UNROLL_M -#define GEMM_UNROLL_N SGEMM_UNROLL_N -#define GEMM_UNROLL_MN SGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#endif -#else -#if defined(XDOUBLE) -#define GEMM_P XGEMM_P -#define GEMM_Q XGEMM_Q -#define GEMM_R XGEMM_R -#define GEMM_UNROLL_M XGEMM_UNROLL_M -#define GEMM_UNROLL_N XGEMM_UNROLL_N -#define GEMM_UNROLL_MN XGEMM_UNROLL_MN -#define GEMM_DEFAULT_P XGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R XGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P ZGEMM_P -#define GEMM_Q ZGEMM_Q -#define GEMM_R ZGEMM_R -#define GEMM_UNROLL_M ZGEMM_UNROLL_M -#define GEMM_UNROLL_N ZGEMM_UNROLL_N -#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN -#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P CGEMM_P -#define GEMM_Q CGEMM_Q -#define GEMM_R CGEMM_R -#define GEMM_UNROLL_M CGEMM_UNROLL_M -#define GEMM_UNROLL_N CGEMM_UNROLL_N -#define GEMM_UNROLL_MN CGEMM_UNROLL_MN -#define GEMM_DEFAULT_P CGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R CGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#endif -#endif - -#ifdef XDOUBLE -#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N -#elif defined(DOUBLE) -#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N -#else -#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N -#endif - - -#ifndef QGEMM_DEFAULT_UNROLL_M -#define QGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef QGEMM_DEFAULT_UNROLL_N -#define QGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_M -#define XGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_N -#define XGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef GEMM_THREAD -#define GEMM_THREAD gemm_thread_n -#endif - -#ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + 
GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) -#endif - -#ifndef SNUMOPT -#define SNUMOPT 2 -#endif - -#ifndef DNUMOPT -#define DNUMOPT 2 -#endif - -#ifndef QNUMOPT -#define QNUMOPT 1 -#endif - -#ifndef GEMM3M_P -#ifdef XDOUBLE -#define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) -#define GEMM3M_P ZGEMM3M_P -#else -#define GEMM3M_P CGEMM3M_P -#endif -#endif - -#ifndef GEMM3M_Q -#ifdef XDOUBLE -#define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) -#define GEMM3M_Q ZGEMM3M_Q -#else -#define GEMM3M_Q CGEMM3M_Q -#endif -#endif - -#ifndef GEMM3M_R -#ifdef XDOUBLE -#define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) -#define GEMM3M_R ZGEMM3M_R -#else -#define GEMM3M_R CGEMM3M_R -#endif -#endif - - -#endif diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c index fa9d7680d..ccb772cc7 100644 --- a/kernel/generic/gemm_beta.c +++ b/kernel/generic/gemm_beta.c @@ -39,7 +39,7 @@ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c index 5f91d0dbe..d3ab46472 100644 --- a/kernel/generic/gemm_ncopy_16.c +++ b/kernel/generic/gemm_ncopy_16.c @@ -39,24 +39,24 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; - FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_ncopy_2.c 
b/kernel/generic/gemm_ncopy_2.c index b728c713f..415860f81 100644 --- a/kernel/generic/gemm_ncopy_2.c +++ b/kernel/generic/gemm_ncopy_2.c @@ -39,10 +39,10 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *a_offset, *a_offset1, *a_offset2; - FLOAT *b_offset; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset; a_offset = a; b_offset = b; diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c index a49a778e6..aaf9c8917 100644 --- a/kernel/generic/gemm_ncopy_8.c +++ b/kernel/generic/gemm_ncopy_8.c @@ -39,30 +39,30 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c index 56268ebf2..14252599a 100644 --- a/kernel/generic/gemm_tcopy_16.c +++ b/kernel/generic/gemm_tcopy_16.c @@ -39,22 +39,22 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2; - FLOAT *boffset; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, 
ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_tcopy_2.c b/kernel/generic/gemm_tcopy_2.c index 5695b13c2..b4aa4de57 100644 --- a/kernel/generic/gemm_tcopy_2.c +++ b/kernel/generic/gemm_tcopy_2.c @@ -39,11 +39,11 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *a_offset, *a_offset1, *a_offset2; - FLOAT *b_offset, *b_offset1, *b_offset2; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset, *b_offset1, *b_offset2; a_offset = a; b_offset = b; diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c index b28f3d219..3e8a839db 100644 --- a/kernel/generic/gemm_tcopy_8.c +++ b/kernel/generic/gemm_tcopy_8.c @@ -39,32 +39,32 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index 01f1c67b5..cc7bb8e48 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,13 +1,32 @@ #include "common.h" -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#if defined(HALF) && defined(HALFCONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float 
result = 0; + unsigned short* q = (unsigned short*)(&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc #ifdef TRMMKERNEL ,BLASLONG offset #endif ) { BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + FLOAT *C0,*C1; + IFLOAT *ptrba,*ptrbb; + FLOAT res0,res1,res2,res3; + IFLOAT load0,load1,load2,load3,load4,load5,load6,load7; for (j=0; j + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#ifdef TRMMKERNEL +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define 
REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + N = n >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + BLASLONG l = 0; + PREFETCH1 (CO, 0); + PREFETCH1 (CO + ldc, 0); + PREFETCH1 (CO + ldc + ldc, 0); + PREFETCH1 (CO + ldc + ldc + ldc, 0); + PREFETCH1 (CO, 128); + PREFETCH1 (CO + ldc, 128); + PREFETCH1 (CO + ldc + ldc, 128); + PREFETCH1 (CO + ldc + ldc + ldc, 128); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC (&acc5, 10); + SAVE_ACC (&acc7, 14); + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + 
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp 
(&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + SAVE2x4_ACC (&acc4, 8); + SAVE2x4_ACC (&acc5, 10); + SAVE2x4_ACC (&acc6, 12); + SAVE2x4_ACC (&acc7, 14); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = 
t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + v4sf_t t4 = { 0, 0 }; + v4sf_t t5 = { 0, 0 }; + v4sf_t t6 = { 0, 0 }; + v4sf_t t7 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; + v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; + v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; + v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; + v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; + v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; + v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; + v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + t4 += rowA4 * rowB; + t5 += rowA5 * rowB; + t6 += rowA6 * rowB; + t7 += rowA7 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + t4 = t4 * valpha; + t5 = t5 * valpha; + t6 = t6 * valpha; + t7 = t7 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; + CO[8] = t4[0]; + CO[9] = t4[1]; + CO[10] = t5[0]; + CO[11] = t5[1]; + CO[12] = t6[0]; + CO[13] = t6[1]; + CO[14] = t7[0]; + CO[15] = t7[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; + CO[8] += t4[0]; + CO[9] += t4[1]; + CO[10] += t5[0]; + CO[11] += t5[1]; + CO[12] += t6[0]; + CO[13] += t6[1]; + CO[14] += t7[0]; + CO[15] += t7[1]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif 
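
The single-column remainder loops above (`while (i >= 16)` down to `while (i >= 1)`) drop from MMA accumulators to plain GCC vector extensions: with FLOAT defined as double, each 16-byte `v4sf_t` holds two lanes, the k-loop accumulates A*B products, alpha is applied once after the loop, and the store either overwrites C (TRMMKERNEL) or accumulates into it. A self-contained sketch of that tail pattern under those assumptions (`tail_2x1` is an illustrative name, not the kernel's):

    /* Sketch of the remainder-loop pattern above: GCC vector
     * extensions, two doubles per 16-byte vector, alpha applied
     * after the k-loop. Illustrative only. */
    typedef double v2df_t __attribute__ ((vector_size (16)));

    static void tail_2x1(long k, double alpha, const double *a,
                         const double *b, double *c, int trmm)
    {
        v2df_t acc = { 0.0, 0.0 };
        for (long l = 0; l < k; l++) {
            v2df_t rowA = { a[2*l], a[2*l + 1] };  /* two packed rows of A  */
            v2df_t rowB = { b[l], b[l] };          /* one B value, splatted */
            acc += rowA * rowB;
        }
        acc *= (v2df_t){ alpha, alpha };
        if (trmm) { c[0]  = acc[0]; c[1]  = acc[1]; }  /* TRMM: overwrite   */
        else      { c[0] += acc[0]; c[1] += acc[1]; }  /* GEMM: accumulate  */
    }

The overwrite-versus-accumulate split is why every SAVE macro in this file comes in two variants under `#if defined(TRMMKERNEL)`.
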
+ } + while (i >= 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index 1a3d7669c..b4dfda550 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dgemv_n_microk_power8.c" #endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index baeb54205..b808ab566 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 779a08e9c..7e0fe48c0 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 52b7f50da..795bb10b4 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
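
The one-line hunks in this stretch (dgemv_n.c, drot.c, dscal.c, dswap.c, sasum.c, scopy.c, sdot.c) all extend the same guard, `#if defined(POWER8) || defined(POWER9)`, to also cover POWER10, so the existing POWER8 VSX microkernels are reused on the new CPU. These wrapper files follow a common include-or-fallback shape; roughly (the HAVE_KERNEL_32 macro name and fallback body are illustrative, not verified against each wrapper):

    /* Sketch of the wrapper/microkernel split the hunks above extend:
     * on POWER, the included microkernel defines a HAVE_* macro and a
     * fast path; elsewhere the portable C loop below is compiled in. */
    #if defined(POWER8) || defined(POWER9) || defined(POWER10)
    #include "dswap_microk_power8.c"    /* assumed to define HAVE_KERNEL_32 */
    #endif

    #ifndef HAVE_KERNEL_32
    static void dswap_kernel_32(long n, double *x, double *y)
    {
        /* portable fallback: plain element-wise swap */
        for (long i = 0; i < n; i++) {
            double t = x[i]; x[i] = y[i]; y[i] = t;
        }
    }
    #endif

Widening the guard is therefore the entire port for these files: no new POWER10 microkernels are needed where the POWER8 ones are already optimal.
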
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 5908347d3..b259d7d76 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sasum_microk_power8.c" #endif diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 5e3fe45a5..5207d386e 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "scopy_microk_power8.c" #endif diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index ae527dde9..8de434e41 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sdot_microk_power8.c" #endif diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c new file mode 100644 index 000000000..01c122c6d --- /dev/null +++ b/kernel/power/sgemm_kernel_power10.c @@ -0,0 +1,1334 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if defined(TRMMKERNEL) +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * 
alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif +#define KERNEL(i, j) \ + __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ + __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL 
(10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t 
*rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i 
= m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 
1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, 
acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l 
<< 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * 
alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c new file mode 100644 index 000000000..7455f925c --- /dev/null +++ b/kernel/power/shgemm_kernel_power10.c @@ -0,0 +1,1044 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include <altivec.h> +#if defined(HALF) && defined(HALFCONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short *q = (unsigned short *) (&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} + +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +vector char mask = + { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, + 0xf +}; + +/* + * The BFLOAT16 xvbf16ger2pp instruction needs a 4x2 matrix of + * bfloat16 floating-point values as input. Hence this + * merging is needed on A and B matrices. 
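+ * For example, a vector holding {v0,v1,v2,v3,v4,v5,v6,v7} (four rows of A over two consecutive k steps) becomes {v0,v4,v1,v5,v2,v6,v3,v7} after MERGE_ROW, so each row's k and k+1 values share one 32-bit lane; MERGE_HIGH/MERGE_LOW build the same pairing from the halves of two separate vectors via vec_mergeh/vec_mergel.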
+ */ +#define MERGE_ROW(x) vec_perm(x, x, mask) +#define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) +#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) + +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; + +#define MMA __builtin_mma_xvbf16ger2pp + +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); +/************************************************************************************* +* SHGEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, + IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + BLASLONG N = n; + BLASLONG i1; + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + N = n >> 3; + /* Loop for n >= 8. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + /* Loop for m >= 16. 
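Each pass computes a 16x8 tile of C in eight 4x4 MMA accumulators; k is walked two steps at a time because every xvbf16ger2pp consumes paired bf16 operands, and the odd-k tail pads A with vzero so the second element of each pair contributes nothing.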
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); + vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); + vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); + vec_t rowA2_l = MERGE_LOW (rowA[1], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); + CO += 16; + + AO += (k << 4); + BO += (k << 3); + } + i = (m & 15) >> 3; + /* Loop for m >= 8. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + CO += 8; + AO += (k << 3); + BO += (k << 3); + } + i = (m & 7) >> 2; + /* Loop for m >= 4. 
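This tile is 4x8: MERGE_ROW pairs the two k steps held in one A vector, while MERGE_HIGH/MERGE_LOW split the eight B values into halves 0-3 and 4-7 for the two accumulators.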
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowA_mrg = MERGE_ROW (rowA[0]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (k << 2); + BO += (k << 3); + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 2)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (k << 1); + BO += (k << 3); + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 } + , t1 = + { + 0, 0, 0, 0}; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), + BF16TOF32 (BO[(l << 3) + 2]), + BF16TOF32 (BO[(l << 3) + 3]) + }; + v4sf_t rowB1 = + { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), + BF16TOF32 (BO[(l << 3) + 6]), + BF16TOF32 (BO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; + CO += 1; + AO += k; + BO += (k << 3); + } + B += k << 3; + } + N = (n & 7) >> 2; + /* Loop for n >= 4. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; + i = m >> 5; + /* Loop for m >= 32. 
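The widest panel of this section: two 16-row halves of A (AO and A1 = AO + 16*k) share one merged B vector, filling eight accumulators for a 32x4 tile per pass.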
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + IFLOAT *A1 = AO + (16 * k); + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += k << 4; + BO += k << 2; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 2; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 2; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += k << 1; + BO += k << 2; + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), + BF16TOF32 (BO[(l << 2) + 2]), + BF16TOF32 (BO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + AO += k; + BO += (k << 2); + CO += 1; + } + + B += k << 2; + } + N = (n & 3) >> 1; + /* Loop for n >= 2. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 5; + /* Loop for m >= 32. 
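With only two columns of B, the B values of two adjacent k steps are interleaved by hand into a vector short, and SAVE2x4_ACC stores just the top two rows of each accumulator.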
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + IFLOAT *A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowA1 = (vec_t *) & (A1[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += k << 4; + BO += k << 1; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 1; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 1; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (k << 1); l += 2) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l + 1]), + BF16TOF32 (AO[l + 1]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l + 1]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l + 1]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; + CO += 2; + AO += k << 1; + BO += k << 1; + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), 0, 0 }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 1]), BF16TOF32 (BO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + CO[0 * ldc] += t[0] * alpha; + CO[1 * ldc] += t[1] * alpha; + CO += 1; + AO += k; + BO += k << 1; + } + B += k << 1; + } + N = (n & 1) >> 0; + /* Loop for n = 1. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + /* Loop for m >= 16. 
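A single column of B cannot feed the paired MMA operand format, so this section falls back to plain vector arithmetic: four v4sf_t partial sums cover 16 rows of A, with every bf16 element widened through BF16TOF32.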
*/ + while (i >= 16) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), + BF16TOF32 (AO[(l << 4) + 2]), + BF16TOF32 (AO[(l << 4) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), + BF16TOF32 (AO[(l << 4) + 6]), + BF16TOF32 (AO[(l << 4) + 7]) + }; + v4sf_t rowA2 = + { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), + BF16TOF32 (AO[(l << 4) + 10]), + BF16TOF32 (AO[(l << 4) + 11]) + }; + v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), + BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), + BF16TOF32 (AO[(l << 4) + 15]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; + AO += k << 4; + BO += k; + CO += 16; + i -= 16; + } + /* Loop for m >= 8. */ + while (i >= 8) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), + BF16TOF32 (AO[(l << 3) + 2]), + BF16TOF32 (AO[(l << 3) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), + BF16TOF32 (AO[(l << 3) + 6]), + BF16TOF32 (AO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + AO += k << 3; + BO += k; + CO += 8; + i -= 8; + } + /* Loop for m >= 4. */ + while (i >= 4) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), + BF16TOF32 (AO[(l << 2) + 2]), + BF16TOF32 (AO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + AO += k << 2; + BO += k; + CO += 4; + i -= 4; + } + /* Loop for m >= 2. */ + while (i >= 2) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), 0, 0 }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 1]), BF16TOF32 (AO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + AO += k << 1; + BO += k; + CO += 2; + i -= 2; + } + /* Loop for m = 1. 
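The final element reduces to a scalar dot product accumulated in f32 and scaled by alpha once at the end.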
*/ + while (i >= 1) + { + IFLOAT *BO = B; + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < k; l++) + { + t += BF16TOF32 (AO[l]) * BF16TOF32 (BO[l]); + } + AO += k; + BO += k; + CO[0] += t * alpha; + CO += 1; + i -= 1; + } + + B += k; + } + + return 0; +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 6af813c16..9638a59eb 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 4f3ba5698..ddd5b2c5b 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 23d13280f..a56434444 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index f61c62e75..8383e39ab 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index f0f8c6910..4a7c26c69 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zaxpy_microk_power8.c" #endif diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index b21d6ef15..bb80decd2 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index fd36c7f44..9086ef35b 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 000000000..fca389e69 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include 
"zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 000000000..1143733e0 --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: 
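+/* mid-loop entry point: the ZGEMM_L2x1_SUB0 paths below branch here via 'bl' with CTR = 1 after priming the first loads, so only the remaining unrolled KERNEL2x1_L2 steps of one 32-step block execute; the other _Kxx entry labels follow the same pattern */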
+/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 / 128 */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (T1-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (K-2) / 32 */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. 
T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x1_32K + addi BO, BO, -32 + addi AO,AO, -16 + LOAD2x1O 16, 32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -32 + LOAD2x1_2O 32, 64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_E2 32, 64, 7, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_E2 32, 64, 3, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_E2 32, 64, 1, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32, 64, 0, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 0 + KERNEL1x8_2 32, 0 + KERNEL1x8_2 33, 0 + KERNEL1x8_2 34, 0 + KERNEL1x8_2 35, 0 + KERNEL1x8_2 36, 0 + KERNEL1x8_2 37, 0 + KERNEL1x8_2 38, 0 + KERNEL1x8_2 39, 0 + KERNEL1x8_2 40, 0 + KERNEL1x8_2 41, 0 + KERNEL1x8_2 42, 0 + KERNEL1x8_2 43, 0 + KERNEL1x8_2 44, 0 + KERNEL1x8_2 45, 0 + KERNEL1x8_2 46, 0 + KERNEL1x8_2 47, 0 + KERNEL1x8_2 48, 0 + KERNEL1x8_2 49, 0 + KERNEL1x8_2 50, 0 + KERNEL1x8_2 51, 0 + KERNEL1x8_2 52, 0 + KERNEL1x8_2 53, 0 + KERNEL1x8_2 54, 0 + KERNEL1x8_2 55, 0 + KERNEL1x8_2 56, 0 + KERNEL1x8_2 57, 0 + KERNEL1x8_2 58, 0 + KERNEL1x8_2 59, 0 + KERNEL1x8_2 60, 0 + KERNEL1x8_2 61, 0 + KERNEL1x8_2 62, 0 + KERNEL1x8_2 63, 1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + KERNEL1x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_2 0, 0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 0 + KERNEL1x4_2 8, 0 + KERNEL1x4_2 9, 0 + KERNEL1x4_2 10, 0 + KERNEL1x4_2 11, 0 + KERNEL1x4_2 12, 0 + KERNEL1x4_2 13, 0 + KERNEL1x4_2 14, 0 + KERNEL1x4_2 15, 1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + KERNEL1x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_2 0, 0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 0 + KERNEL1x2_2 8, 0 + KERNEL1x2_2 9, 0 + KERNEL1x2_2 10, 0 + KERNEL1x2_2 11, 0 + KERNEL1x2_2 12, 0 + KERNEL1x2_2 13, 0 + KERNEL1x2_2 14, 0 + KERNEL1x2_2 15, 1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + KERNEL1x2_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 0, 0 + + 
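+/* Note: ZGEMM_L1x1_K32 below is also used as a direct entry point. The SUB0
+   paths set CTR=1, perform the initial LOAD themselves and branch straight
+   here (see CMP1x1_32K), so one pass of kernels 1..15 plus the END1x1_2 step
+   after the loop consumes all 32 k-iterations. */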
+ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_L2 32, 32, 7, 0 + KERNEL1x1_L2 32, 32, 8, 0 + KERNEL1x1_L2 32, 32, 9, 0 + KERNEL1x1_L2 32, 32, 10, 0 + KERNEL1x1_L2 32, 32, 11, 0 + KERNEL1x1_L2 32, 32, 12, 0 + KERNEL1x1_L2 32, 32, 13, 0 + KERNEL1x1_L2 32, 32, 14, 0 + KERNEL1x1_L2 32, 32, 15, 1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + KERNEL1x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP1x8_128K + LOAD_END_1x8 -128, -16 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -256 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + KERNEL1x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + LOAD_END_1x8 128, 16 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL1x8_UNPRIME_MMA + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL1x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x4_32K + LOAD_END_1x4 -64, -16 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -128 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + KERNEL1x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + LOAD_END_1x4 64,16 + + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + KERNEL1x4_UNPRIME_MMA + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + KERNEL1x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x2_32K + LOAD_END_1x2 -32, -16 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -64 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + KERNEL1x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + LOAD_END_1x2 32,16 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + KERNEL1x2_UNPRIME_MMA + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. 
L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x1_32K + addi BO, BO, -16 + addi AO,AO, -16 + LOAD1x1O 16, 16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -32 + LOAD1x1_2O 32, 32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1, L, 16 + ble ZGEMM_L1x1_SUB2_8 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_E2 32, 32, 7, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1, L, 8 + ble ZGEMM_L1x1_SUB2_4 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_E2 32, 32, 3, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_E2 32, 32, 1, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32, 32, 0, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S new file mode 100644 index 000000000..42f9c5ad4 --- /dev/null +++ b/kernel/power/zgemm_macros_power10.S @@ -0,0 +1,1138 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+/* HELPERS FOR SAVE */
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha imag instead to fix sign*/
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro
STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs38,vs40,vs48,vs49 + MULT_APLHA_PART2 vs34,vs36,vs46,vs47 + AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + MULT_APLHA_PART1 vs42,vs44, vs56,vs57 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + MULT_APLHA_PART2 vs42,vs44,vs56,vs57 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59 + UNPACK_FOR_STORE vs56,vs57,vs42,vs44 + UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART1 vs38,vs40, vs48,vs49 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 +#ifndef TRMMKERNEL + lxv vs50, (\LOFFSET)(\BASE_REG) + xxmrgld vs46,vs50,vs50 + xxmrghd vs47,vs50,vs50 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 + 
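/* vs34/vs35 now hold the packed real*real and imag*imag products, vs36/vs37
+   the real*imag and imag*real products. A C-style sketch of what the rest of
+   SAVE1 computes on the default (NN-type, non-TRMM) path, with c loaded from C:
+     r = rr - ii;  i = ri + ir;          (AGGREGATE_REALS_IMAGES)
+     c_re += alpha_r * r - alpha_i * i;  (MULT_APLHA_PART1/2)
+     c_im += alpha_i * r + alpha_r * i;                                      */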
+ AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
+ MULT_APLHA_PART1 vs34,vs36, vs46,vs47
+ MULT_APLHA_PART2 vs34,vs36, vs46,vs47
+ UNPACK_FOR_STORE vs46,vs47,vs39,vs41
+ xxmrghd vs39,vs47,vs46
+ stxv vs39, (\LOFFSET)(\BASE_REG)
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro KERNEL2x8_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+ xxsetaccz 4
+ xxsetaccz 5
+ xxsetaccz 6
+ xxsetaccz 7
+.endm
+
+
+.macro KERNEL2x8_PRELOAD
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxvp vs36, 64(AO) // load real,imag from A
+ lxvp vs38, 96(AO) // load real,imag from A
+ lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+.macro KERNEL2x8_2 Index, IsLast
+ lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
+ lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
+ lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
+ lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
+ lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs36, vs49
+ xvf64gerpp 3, vs38, vs49
+ xvf64gerpp 4, vs32, vs48
+ xvf64gerpp 5, vs34, vs48
+ xvf64gerpp 6, vs36, vs48
+ xvf64gerpp 7, vs38, vs48
+ lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
+ lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
+ lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
+ lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
+ lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs42, vs51
+ xvf64gerpp 2, vs44, vs51
+ xvf64gerpp 3, vs46, vs51
+ xvf64gerpp 4, vs40, vs50
+ xvf64gerpp 5, vs42, vs50
+ xvf64gerpp 6, vs44, vs50
+ xvf64gerpp 7, vs46, vs50
+.if \IsLast==1
+ addi AO, AO, DISP16(\Index,256)
+ addi BO, BO, DISP4(\Index,64)
+.endif
+.endm
+
+
+.macro LOAD_END_2x8 OffsetA,OffsetB
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs36, vs49
+ xvf64gerpp 3, vs38, vs49
+ xvf64gerpp 4, vs32, vs48
+ xvf64gerpp 5, vs34, vs48
+ xvf64gerpp 6, vs36, vs48
+ xvf64gerpp 7, vs38, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL2x8_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+ xxmfacc 2
+ xxmfacc 3
+ xxmfacc 4
+ xxmfacc 5
+ xxmfacc 6
+ xxmfacc 7
+.endm
+
+
+.macro SAVE2x8
+ add T1, CO ,LDC
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+ xxpermdi vs40, vs8, vs9, 0b01
+ xxpermdi vs41, vs8, vs9, 0b10
+ xxpermdi vs42, vs10, vs11, 0b01
+ xxpermdi vs43, vs10, vs11, 0b10
+ xxpermdi vs44, vs12, vs13, 0b01
+ xxpermdi vs45, vs12, vs13, 0b10
+ xxpermdi vs46, vs14, vs15, 0b01
+ xxpermdi vs47, vs14, vs15, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+ xxlor vs10, vs40, vs40
+ xxlor vs11, vs41, vs41
+ xxlor vs8, vs42, vs42
+ xxlor vs9, vs43, vs43
+ xxlor vs14, vs44, vs44
+ xxlor vs15, vs45, vs45
+ xxlor vs12, vs46, vs46
+ xxlor vs13, vs47, vs47
+
+ xxpermdi vs32, vs16, vs17, 0b01
+ xxpermdi vs33, vs16, vs17, 0b10
+ xxpermdi vs34, vs18, vs19, 0b01
+ xxpermdi vs35, vs18, vs19, 0b10
+ xxpermdi vs36, vs20, vs21, 0b01
+ xxpermdi vs37, vs20, vs21, 0b10
+ xxpermdi vs38, vs22, vs23, 0b01
+ xxpermdi vs39, vs22, vs23, 0b10
+ xxpermdi vs40, vs24, vs25, 0b01
+ xxpermdi vs41, vs24, vs25, 0b10
+ xxpermdi vs42, vs26, vs27, 0b01
+ xxpermdi vs43, vs26, vs27, 0b10
+ xxpermdi vs44, vs28, vs29, 0b01
+ xxpermdi vs45, vs28, vs29, 0b10
+ xxpermdi vs46, vs30, vs31, 0b01
+ xxpermdi vs47, vs30, vs31, 0b10
+
+ xxlor vs18, vs32, vs32
+ xxlor vs19, vs33, vs33
+ xxlor vs16, vs34, vs34
+ xxlor vs17, vs35, vs35
+ xxlor vs22, vs36, vs36
+ xxlor vs23, vs37, vs37
+ xxlor vs20, vs38, vs38
+ xxlor vs21, vs39, vs39
+ xxlor vs26, vs40, vs40
+ xxlor vs27, vs41, vs41
+ xxlor vs24, vs42, vs42
+ xxlor vs25, vs43, vs43
+ xxlor vs30, vs44, vs44
+ xxlor vs31, vs45, vs45
+ xxlor vs28, vs46, vs46
+ xxlor vs29, vs47, vs47
+
+ SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
+ SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
+ addi CO, CO, 128
+.endm
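+/* MMA usage pattern (as in the 2x8 section above and the narrower sections
+   below): xxsetaccz zeroes and primes an accumulator, each xvf64gerpp adds
+   one rank-1 outer product into it, and xxmfacc hands the accumulator back
+   to its four underlying VSRs. A C-style sketch of one xvf64gerpp step on a
+   4x2 double-precision accumulator:
+     for (i = 0; i < 4; i++)
+       for (j = 0; j < 2; j++)
+         acc[i][j] += a[i] * b[j];  // a: VSX register pair, b: one VSR
+   The complex sign handling and alpha scaling happen afterwards, in the
+   SAVE paths, via the helper macros at the top of this file. */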
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro KERNEL2x4_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+
+.macro KERNEL2x4_PRELOAD
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+.macro KERNEL2x4_2 Index, IsLast
+ lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
+ lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
+ lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs32, vs48
+ xvf64gerpp 3, vs34, vs48
+ lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
+ lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
+ lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs42, vs51
+ xvf64gerpp 2, vs40, vs50
+ xvf64gerpp 3, vs42, vs50
+.if \IsLast==1
+ addi AO, AO, DISP8(\Index,128)
+ addi BO, BO, DISP4(\Index,64)
+.endif
+.endm
+
+
+.macro LOAD_END_2x4 OffsetA, OffsetB
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs32, vs48
+ xvf64gerpp 3, vs34, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL2x4_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+ xxmfacc 2
+ xxmfacc 3
+.endm
+
+
+.macro SAVE2x4
+ add T1, CO ,LDC
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+ xxpermdi vs40, vs8, vs9, 0b01
+ xxpermdi vs41, vs8, vs9, 0b10
+ xxpermdi vs42, vs10, vs11, 0b01
+ xxpermdi vs43, vs10, vs11, 0b10
+ xxpermdi vs44, vs12, vs13, 0b01
+ xxpermdi vs45, vs12, vs13, 0b10
+ xxpermdi vs46, vs14, vs15, 0b01
+ xxpermdi vs47, vs14, vs15, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+ xxlor vs10, vs40, vs40
+ xxlor vs11, vs41, vs41
+ xxlor vs8, vs42, vs42
+ xxlor vs9, vs43, vs43
+ xxlor vs14, vs44, vs44
+ xxlor vs15, vs45, vs45
+ xxlor vs12, vs46, vs46
+ xxlor vs13, vs47, vs47
+
+ SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
+ SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
+ addi CO, CO, 64
+.endm
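+/* Addressing note: DISPn(ind,disp) expands to ind*unit_size*n + disp, with
+   unit_size = 16 bytes (one double-complex element). Each KERNEL2x4_2 step
+   covers two k-iterations, i.e. 8 complexes of A (DISP8, 128 bytes) and 4
+   complexes of B (DISP4, 64 bytes), matching the pointer bumps made when
+   IsLast==1. */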
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro KERNEL2x2_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+
+.macro KERNEL2x2_PRELOAD
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+.macro KERNEL2x2_2 Index, IsLast
+ lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
+ lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs32, vs48
+ lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
+ lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
+ xvf64gerpp 0, vs40, vs51
+ xvf64gerpp 1, vs40, vs50
+.if \IsLast==1
+ addi AO, AO, DISP4(\Index,64)
+ addi BO, BO, DISP4(\Index,64)
+.endif
+.endm
+
+
+.macro LOAD_END_2x2 OffsetA,OffsetB
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs32, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL2x2_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+.endm
+
+
+.macro SAVE2x2
+ add T1, CO ,LDC
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+
+ SAVE2 vs0,vs1,vs2,vs3,CO,0
+ SAVE2 vs4,vs5,vs6,vs7,T1,0
+ addi CO, CO, 32
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
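+/* Unlike the wider kernels, the M=1 cases (here and in the matching N=1
+   section) do not use MMA: with a single column element the outer product
+   degenerates, so they accumulate with plain VSX xvmaddadp FMAs, using
+   xxswapd to form the swapped {imag,real} operand needed for the cross
+   terms. */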
+.macro ZERO2x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxlxor vs2, vs2, vs2
+ xxlxor vs3, vs3, vs3
+
+.endm
+
+
+.macro LOAD2x1
+ LOAD2x1O 0,0
+.endm
+
+
+.macro LOAD2x1O OffsetA,OffsetB
+ lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs49, vs48
+ xxswapd vs51, vs50
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO,BO,0,0
+.endm
+
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs2, vs32, vs50
+ xvmaddadp vs1, vs32, vs49
+ xvmaddadp vs3, vs32, vs51
+.endm
+
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0,0
+.endm
+
+
+.macro LOAD2x1_2O OffsetA,OffsetB
+ lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
+ lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
+ lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
+ xxswapd vs49, vs48
+ xxswapd vs51, vs50
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END2x1_2
+ /*for load2 offset will be 32 and 64*/
+ KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
+.endm
+
+
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs53, vs52
+ xxswapd vs55, vs54
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs2, vs32, vs50
+ xvmaddadp vs1, vs32, vs49
+ xvmaddadp vs3, vs32, vs51
+.if \Complete==0
+ lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+ lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \Complete==0
+ xxswapd vs49, vs48
+ xxswapd vs51, vs50
+.endif
+ xvmaddadp vs0, vs40, vs52
+ xvmaddadp vs2, vs40, vs54
+ xvmaddadp vs1, vs40, vs53
+ xvmaddadp vs3, vs40, vs55
+.if \Complete==0
+ lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
+ lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP4(\Index,64)
+.endif
+.endif
+.endm
+
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 16,32
+.endm
+
+
+.macro SAVE2x1
+ add T1, CO ,LDC
+ SAVE1 vs0,vs1,CO,0
+ SAVE1 vs2,vs3,T1,0
+ addi CO, CO, 16
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro KERNEL1x8_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+
+.macro KERNEL1x8_2 Index,IsLast
+ lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A
+ lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A
+ lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A
+ lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A
+ lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A
+ lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A
+ lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
+ lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
+ lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 2, vs36, vs49
+ xvf64gerpp 3, vs38, vs49
+ xvf64gerpp 0, vs40, vs48
+ xvf64gerpp 1, vs42, vs48
+ xvf64gerpp 2, vs44, vs48
+ xvf64gerpp 3, vs46, vs48
+.if \IsLast==1
+ addi AO, AO, DISP16(\Index,256)
+ addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x8 OffsetA,OffsetB
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxvp vs36, 64(AO) // load real,imag from A
+ lxvp vs38, 96(AO) // load real,imag from A
+ lxv vs48, 0(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ xvf64gerpp 2, vs36, vs48
+ xvf64gerpp 3, vs38, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x8_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+ xxmfacc 2
+ xxmfacc 3
+.endm
+
+
+.macro SAVE1x8
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+ xxpermdi vs40, vs8, vs9, 0b01
+ xxpermdi vs41, vs8, vs9, 0b10
+ xxpermdi vs42, vs10, vs11, 0b01
+ xxpermdi vs43, vs10, vs11, 0b10
+ xxpermdi vs44, vs12, vs13, 0b01
+ xxpermdi vs45, vs12, vs13, 0b10
+ xxpermdi vs46, vs14, vs15, 0b01
+ xxpermdi vs47, vs14, vs15, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+ xxlor vs10, vs40, vs40
+ xxlor vs11, vs41, vs41
+ xxlor vs8, vs42, vs42
+ xxlor vs9, vs43, vs43
+ xxlor vs14, vs44, vs44
+ xxlor vs15, vs45, vs45
+ xxlor vs12, vs46, vs46
+ xxlor vs13, vs47, vs47
+
+ SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
+ addi CO, CO, 128
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro KERNEL1x4_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+
+.macro KERNEL1x4_2 Index,IsLast
+ lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A
+ lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A
+ lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
+ lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
+ lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 1, vs34, vs49
+ xvf64gerpp 0, vs40, vs48
+ xvf64gerpp 1, vs42, vs48
+.if \IsLast==1
+ addi AO, AO, DISP8(\Index,128)
+ addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x4 OffsetA,OffsetB
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxvp vs34, 32(AO) // load real,imag from A
+ lxv vs48, 0(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs48
+ xvf64gerpp 1, vs34, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x4_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+ xxmfacc 1
+.endm
+
+
+.macro SAVE1x4
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+ xxpermdi vs36, vs4, vs5, 0b01
+ xxpermdi vs37, vs4, vs5, 0b10
+ xxpermdi vs38, vs6, vs7, 0b01
+ xxpermdi vs39, vs6, vs7, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+ xxlor vs6, vs36, vs36
+ xxlor vs7, vs37, vs37
+ xxlor vs4, vs38, vs38
+ xxlor vs5, vs39, vs39
+
+ SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
+ addi CO, CO, 64
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro KERNEL1x2_ZERO_AND_PRIME_MMA
+ /* zero out and prime the MMA accumulators */
+ xxsetaccz 0
+.endm
+
+
+.macro KERNEL1x2_2 Index,IsLast
+ lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
+ lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
+ lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs49
+ xvf64gerpp 0, vs40, vs48
+.if \IsLast==1
+ addi AO, AO, DISP4(\Index,64)
+ addi BO, BO, DISP2(\Index,32)
+.endif
+.endm
+
+
+.macro LOAD_END_1x2 OffsetA,OffsetB
+ lxvp vs32, 0(AO) // load real,imag from A
+ lxv vs48, 0(BO) // load real imag from B
+ xvf64gerpp 0, vs32, vs48
+ addi BO, BO, \OffsetB
+ addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL1x2_UNPRIME_MMA
+ /* "unprime" MMA accumulators */
+ xxmfacc 0
+.endm
+
+
+.macro SAVE1x2
+ xxpermdi vs32, vs0, vs1, 0b01
+ xxpermdi vs33, vs0, vs1, 0b10
+ xxpermdi vs34, vs2, vs3, 0b01
+ xxpermdi vs35, vs2, vs3, 0b10
+
+ xxlor vs2, vs32, vs32
+ xxlor vs3, vs33, vs33
+ xxlor vs0, vs34, vs34
+ xxlor vs1, vs35, vs35
+
+ SAVE2 vs0,vs1,vs2,vs3,CO,0
+ addi CO, CO, 32
+.endm
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro ZERO1x1
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+.endm
+
+
+.macro LOAD1x1
+ LOAD1x1O 0,0
+.endm
+
+
+.macro LOAD1x1O OffsetA,OffsetB
+ lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+ xxswapd vs49, vs48
+
+.endm
+
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO,BO,0,0
+.endm
+
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs1, vs32, vs49
+.endm
+
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0,0
+.endm
+
+
+.macro LOAD1x1_2O OffsetA,OffsetB
+ lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
+ lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
+ xxswapd vs49, vs48
+
+ lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
+ lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
+.endm
+
+
+.macro END1x1_2
+ /*for load2 offset will be 32 and 32*/
+ KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
+.endm
+
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xxswapd vs53, vs52
+ xvmaddadp vs0, vs32, vs48
+ xvmaddadp vs1, vs32, vs49
+.if \Complete==0
+ lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
+.endif
+.if \Complete==0
+ lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
+.endif
+.if \Complete==0
+ xxswapd vs49, vs48
+.endif
+ xvmaddadp vs0, vs40, vs52
+ xvmaddadp vs1, vs40, vs53
+.if \Complete==0
+ lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
+.endif
+
+.if \Complete==0
+ lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index,32)
+ addi \BREG, \BREG, DISP2(\Index,32)
+.endif
+.endif
+.endm
+
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 16,16
+.endm
+
+
+
+.macro SAVE1x1
+ SAVE1 vs0,vs1,CO,0
+ addi CO, CO, 16
+.endm
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 8
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 4
+ .endif
+.endm
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*16;
+// ptrbb = bb + off*2;
+// #endif
+*/
+
+
+.macro REFRESH_POINTERS
PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm + diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index a1b441d2c..16b584bca 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 1d8826f41..c6508f032 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zswap_microk_power8.c" #endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 3c71c778e..d3aa030c1 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,6 +53,65 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, +#ifdef BUILD_HALF + 0, 0, 0, + SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, +#ifdef SHGEMM_DEFAULT_UNROLL_MN + SHGEMM_DEFAULT_UNROLL_MN, +#else + MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), +#endif + + samax_kTS, samin_kTS, smax_kTS, smin_kTS, + isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, + dsdot_kTS, + srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, + sgemv_nTS, sgemv_tTS, sger_kTS, + ssymv_LTS, ssymv_UTS, + + shgemm_kernelTS, shgemm_betaTS, +#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N + shgemm_incopyTS, shgemm_itcopyTS, +#else + shgemm_oncopyTS, shgemm_otcopyTS, +#endif + shgemm_oncopyTS, shgemm_otcopyTS, + + strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, + strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS, +#else + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, +#endif + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, + strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, + strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS, +#else + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#endif + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + ssymm_iutcopyTS, ssymm_iltcopyTS, +#else + ssymm_outcopyTS, ssymm_oltcopyTS, +#endif + ssymm_outcopyTS, ssymm_oltcopyTS, + +#ifndef NO_LAPACK + sneg_tcopyTS, slaswp_ncopyTS, +#else + NULL,NULL, +#endif +#endif + 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, #ifdef SGEMM_DEFAULT_UNROLL_MN @@ -648,16 +707,25 @@ gotoblas_t TABLE_NAME = { #if defined(ARCH_ARM64) static void init_parameter(void) { +#if defined(BUILD_HALF) + TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#if defined(BUILD_HALF) + TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#if defined(BUILD_HALF) + TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; @@ -721,17 +789,26 @@ static void init_parameter(void) { #if defined(ARCH_POWER) static void init_parameter(void) { +#ifdef BUILD_HALF + 
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF + TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF + TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -741,17 +818,26 @@ static void init_parameter(void) { #if defined(ARCH_ZARCH) static void init_parameter(void) { +#ifdef BUILD_HALF + TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF + TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF + TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -891,6 +977,11 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ +#ifdef BUILD_HALF + TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; + TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; + TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 65f031d03..9b8b84c30 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,7 +14,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 416ace59b..9f2bf24e2 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -54,40 +54,40 @@ #define kernel_kstart_n10(mdim,updk) "" #define kernel_kstart_n12(mdim,updk) "" #define kernel_kend_n4(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) #define kernel_kend_n6(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) 
acc_kend_nc3_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) #define kernel_kend_n8(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) #define kernel_kend_n10(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) #define kernel_kend_n12(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ 
- loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144) #endif #else #define HEAD_SET_OFF(ndim) {} @@ -129,18 +129,28 @@ #define init_update_k(mdim) "" #define save_update_k(mdim) "" #endif - + #define KERNEL_h_k1m16n1 \ "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" #define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" -#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ +#ifdef BROADCAST_KERNEL + #define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" -#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" -#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ - "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" -#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) + "vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) +#else + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ + "vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\ + "vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) 
unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) + #define KERNEL_h_k1m16n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ + unit_acc_m16n2(8,9,10,11,%1) +#endif +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" #define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" @@ -151,24 +161,42 @@ #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #ifdef BROADCAST_KERNEL #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" - #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) - #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) - #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) - #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) - #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) + #else + #define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;" + #endif + #define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15) + #define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2) #endif #define save_init_m16 "movq %2,%3; addq $128,%2;" #ifdef TRMMKERNEL #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #else #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 
64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #endif #define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) #define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) @@ -206,11 +234,11 @@ #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" - #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) - #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) #endif #define save_init_m8 "movq %2,%3; addq $64,%2;" #ifdef TRMMKERNEL @@ -258,11 +286,11 @@ #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" - #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m4 "movq %2,%3; addq $32,%2;" #ifdef TRMMKERNEL @@ -311,11 +339,11 @@ #define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" - #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) - #define 
acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m2 "movq %2,%3; addq $16,%2;" #ifdef TRMMKERNEL @@ -362,11 +390,11 @@ #define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" - #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" - #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" - #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" - #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" - #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" + #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" #endif #define save_init_m1 "movq %2,%3; addq $8,%2;" #ifdef TRMMKERNEL diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c new file mode 100644 index 000000000..a1da60f8f --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c @@ -0,0 +1,129 @@ +#include <stdio.h> +#include "common.h" +#include <immintrin.h> + +int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){ + double *src1, *src2, *src3, *src4, *dst1; + __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4; + BLASLONG dim1_count, dim2_count, src_inc; + src_inc = 4 * lead_dim - dim_first; + src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim; + for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + z5 = _mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16; + z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); + _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6); + _mm512_storeu_pd(dst1+48,z7); _mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + z3 = _mm512_loadu_pd(src3); src3 += 8; + z4 = _mm512_loadu_pd(src4); src4 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1
= _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + y3 = _mm256_loadu_pd(src3); src3 += 4; + y4 = _mm256_loadu_pd(src4); src4 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); + _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + x3 = _mm_loadu_pd(src3); src3 += 2; + x4 = _mm_loadu_pd(src4); src4 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); + _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++; + dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4; + } + src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc; + } + src_inc -= 2 * lead_dim; + for(; dim2_count>1; dim2_count-=2){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; + dst1[0] = s1; dst1[1] = s2; + } + src1 += src_inc; src2 += src_inc; + } + src_inc -= lead_dim; + for(; dim2_count>0; dim2_count--){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; + dst1[0] = s1; + } + src1 += src_inc; + } +} diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index bdcd914fb..f0df5aaa8 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ 
b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -36,7 +36,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize off #endif - +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize off +#endif static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { @@ -164,6 +166,9 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL #if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize on #endif +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize on +#endif #else #include "dsymv_L_microk_haswell-2.c" diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c index 5ab3e6d1f..a2e78c58d 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -1,4 +1,4 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ /* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c index 4131debb1..5410bd4ae 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" #include <stdint.h> #include "strsm_kernel_8x4_haswell_L_common.h" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h index cfa56da97..2862a5b8d 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -1,4 +1,4 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index f6e3bec23..3510938a7 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -86,24 +86,24 @@ DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = cgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c -STRMMKERNEL = strmm8x4V.S -DTRMMKERNEL = trmm8x4V.S +STRMMKERNEL = gemm_vec.c +DTRMMKERNEL = gemm_vec.c CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S -SGEMMKERNEL = strmm8x4V.S -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = gemm_vec.c +ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ =
sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - - -DGEMMKERNEL = gemm8x4V.S +DGEMMKERNEL = gemm_vec.c DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c @@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c new file mode 100644 index 000000000..eb6d7700b --- /dev/null +++ b/kernel/zarch/gemm_vec.c @@ -0,0 +1,475 @@ +/* + * Copyright (c) IBM Corporation 2020. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "common.h" +#include <vecintrin.h> + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#ifdef COMPLEX +#error "Handling for complex numbers is not supported in this kernel" +#endif + +#ifdef DOUBLE +#define UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif + +static const size_t unroll_m = UNROLL_M; +static const size_t unroll_n = UNROLL_N; + +/* Handling of triangular matrices */ +#ifdef TRMMKERNEL +static const bool trmm = true; +static const bool left = +#ifdef LEFT + true; +#else + false; +#endif + +static const bool backwards = +#if defined(LEFT) != defined(TRANSA) + true; +#else + false; +#endif + +#else +static const bool trmm = false; +static const bool left = false; +static const bool backwards = false; +#endif /* TRMMKERNEL */ + +/* + * Background: + * + * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication + * problem by splitting all matrices into partitions multiple times, so that the + * submatrices fit into the L1 or L2 caches.
As a result, each multiplication of + * submatrices can stream data fast from L1 and L2 caches. In between, it copies + * and rearranges the submatrices to enable contiguous memory accesses to + * improve locality in both caches and TLBs. + * + * At the heart of the algorithm is this kernel, which multiplies a "Block + * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is + * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this + * operation GEBP. This kernel further partitions GEBP twice, such that (1) + * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a + * block of C fits into the registers, while multiplying panels from A and B + * streamed from the L2 and L1 cache, respectively (GEBP_block). + * + * + * Algorithm GEBP(A, B, C, m, n, k, alpha): + * + * The problem is calculating C += alpha * (A * B) + * C is an m x n matrix, A is an m x k matrix, B is a k x n matrix. + * + * - C is in column-major-order, with an offset of ldc to the element in the + * next column (same row). + * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column + * contiguously while walking along rows. + * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row + * contiguously. + * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or + * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2 + * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a + * block-of-1). + * + * Note that packing A and B into that form is taken care of by the caller in + * driver/level3/level3.c (actually done by "copy kernels"). + * + * Steps: + * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j. + * Now, B_j should fit into the L1 cache. + * - For each partition, calculate C_j += alpha * (A * B_j) by + * (1) Calculate C_aux := A * B_j (see below) + * (2) unpack C_j = C_j + alpha * C_aux + * + * + * Algorithm for Calculating C_aux: + * + * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows, + * such that the m_r x n_r-submatrix of C_aux can be held in registers. Each + * submatrix of C_aux can be calculated independently, and the registers are + * added back into C_j. + * + * - For each row-block of C_aux: + * (uses a row block of A and full B_j) + * - stream over all columns of A, multiply with elements from B and + * accumulate in registers. (use different inner-kernels to exploit + * vectorization for varying block sizes) + * - add alpha * row block of C_aux back into C_j. + * + * Note that there are additional mechanics for handling triangular matrices, + * calculating B := alpha (A * B) where either of the matrices A or B can be + * triangular. In case of A, the macro "LEFT" is defined. In addition, A can + * optionally be transposed. + * The code effectively skips an "offset" number of columns in A and rows of B + * in each block, to save unnecessary work by exploiting the triangular nature. + * To handle all cases, the code discerns (1) a "left" mode when A is triangular + * and (2) "forward" / "backwards" modes where only the first "offset" + * columns/rows of A/B are used or where the first "offset" columns/rows are + * skipped, respectively. + * + * Reference: + * + * The summary above is based on staring at various kernel implementations and: + * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix + * Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No. + * 3, May 2008.
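To make the packing concrete: with the layouts described above, element (i, kk) of a packed m_r-row block of A sits at A[i + kk * m_r], and element (kk, j) of a packed n_r-column block of B sits at B[j + kk * n_r]. The generic fallback in GEBP_block below uses exactly this indexing; a minimal sketch of one packed block multiplication:

/* Illustrative sketch, not part of the kernel: accumulate one packed
 * m_r x n_r block over k steps, with A and B in the packed layouts
 * described above. */
for (BLASLONG kk = 0; kk < k; kk++)
    for (BLASLONG i = 0; i < m_r; i++)
        for (BLASLONG j = 0; j < n_r; j++)
            Caux[i][j] += A[i + kk * m_r] * B[j + kk * n_r];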
+ */ + +#define VLEN_BYTES 16 +#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) + +typedef FLOAT vector_float __attribute__ ((vector_size (16))); + +/** + * Load a vector into register, and hint on 8-byte alignment to improve + * performance. gcc-9 and newer will create these hints by itself. For older + * compiler versions, use inline assembly to explicitly express the hint. + * Provide explicit hex encoding to cater for binutils versions that do not know + * about vector-load with alignment hints yet. + * + * Note that, for block sizes where we apply vectorization, vectors in A will + * always be 8-byte aligned. + */ +static inline vector_float vec_load_hinted(FLOAT const *restrict a) { + vector_float const *restrict addr = (vector_float const *restrict)a; + vector_float y; + +#if __GNUC__ < 9 + // hex-encode vl %[out],%[addr],3 + asm(".insn vrx,0xe70000003006,%[out],%[addr],3" + : [ out ] "=v"(y) + : [ addr ] "R"(*addr)); +#else + y = *addr; +#endif + + return y; +} + +/** + * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. + * + * @param[in] A Pointer current block of input matrix A. + * @param[in] k Number of columns in A. + * @param[in] B Pointer current block of input matrix B. + * @param[inout] C Pointer current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +#define VECTOR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + _Static_assert( \ + ROWS % VLEN_FLOATS == 0, \ + "rows in block must be multiples of vector length"); \ + vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float A0 = \ + vec_load_hinted(A + i * VLEN_FLOATS); \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] = A0 * B[j]; \ + } \ + \ + /* \ + * Stream over the row-block of A, which is packed \ + * column-by-column, multiply by coefficients in B and add up \ + * into temporaries Caux (which the compiler will hold in \ + * registers). Vectorization: Multiply column vectors from A \ + * with scalars from B and add up in column vectors of Caux. \ + * That equates to unrolling the loop over rows (in i) and \ + * executing each unrolled iteration as a vector element. \ + */ \ + for (BLASLONG k = 1; k < bk; k++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float Ak = vec_load_hinted( \ + A + i * VLEN_FLOATS + k * ROWS); \ + \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += Ak * B[j + k * COLS]; \ + } \ + } \ + \ + /* \ + * Unpack row-block of C_aux into outer C_i, multiply by \ + * alpha and add up. 
\ + */ \ + for (BLASLONG j = 0; j < COLS; j++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float *C_ij = \ + (vector_float *)(C + i * VLEN_FLOATS + \ + j * ldc); \ + if (trmm) { \ + *C_ij = alpha * Caux[i][j]; \ + } else { \ + *C_ij += alpha * Caux[i][j]; \ + } \ + } \ + } \ + } + + +#if UNROLL_M == 16 +VECTOR_BLOCK(16, 4) +VECTOR_BLOCK(16, 2) +VECTOR_BLOCK(16, 1) +#endif +#if UNROLL_N == 8 +VECTOR_BLOCK(8, 8) +VECTOR_BLOCK(4, 8) +#endif +VECTOR_BLOCK(8, 4) +VECTOR_BLOCK(8, 2) +VECTOR_BLOCK(8, 1) +VECTOR_BLOCK(4, 4) +VECTOR_BLOCK(4, 2) +VECTOR_BLOCK(4, 1) + +#ifdef DOUBLE +VECTOR_BLOCK(2, 4) +VECTOR_BLOCK(2, 2) +#endif + +/** + * Handle calculation for row blocks in C_i of any size by dispatching into + * macro-defined (inline) functions or by deferring to a simple generic + * implementation. Note that the compiler can remove this awkward-looking + * dispatching code while inlineing. + * + * @param[in] m Number of rows in block C_i. + * @param[in] n Number of columns in block C_i. + * @param[in] first_row Index of first row of the block C_i (relative to C). + * @param[in] A Pointer to input matrix A (note: all of it). + * @param[in] k Number of columns in A and rows in B. + * @param[in] B Pointer to current column block (panel) of input matrix B. + * @param[inout] C Pointer to current column block (panel) of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @param[in] off Running offset for handling triangular matrices. + */ +static inline void GEBP_block(BLASLONG m, BLASLONG n, + BLASLONG first_row, + const FLOAT * restrict A, BLASLONG k, + const FLOAT * restrict B, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha, + BLASLONG offset, BLASLONG off) +{ + if (trmm && left) + off = offset + first_row; + + A += first_row * k; + C += first_row; + + if (trmm) { + if (backwards) { + A += off * m; + B += off * n; + k -= off; + } else { + if (left) { + k = off + m; + } else { + k = off + n; + } + } + } + +#define BLOCK(bm, bn) \ + if (m == bm && n == bn) { \ + GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ + return; \ + } + +#if UNROLL_M == 16 + BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); +#endif +#if UNROLL_N == 8 + BLOCK(8, 8); BLOCK(4, 8); +#endif + BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); + BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); + + #ifdef DOUBLE + BLOCK(2, 4); + BLOCK(2, 2); + #endif + +#undef BLOCK + + /* simple implementation for smaller block sizes: */ + FLOAT Caux[m][n] __attribute__ ((aligned (16))); + + /* + * Peel off first iteration (i.e., column of A) for initializing Caux + */ + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + Caux[i][j] = A[i] * B[j]; + + for (BLASLONG kk = 1; kk < k; kk++) + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + Caux[i][j] += A[i + kk * m] * B[j + kk * n]; + + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + if (trmm) { + C[i + j * ldc] = alpha * Caux[i][j]; + } else { + C[i + j * ldc] += alpha * Caux[i][j]; + } +} + +/** + * Handle a column block (panel) of C and B while calculating C += alpha(A * B). + * + * @param[in] num_cols Number of columns in the block (in C and B). + * @param[in] first_col First column of the current block (in C and B). + * @param[in] A Pointer to input matrix A. + * @param[in] bk Number of columns in A and rows in B. 
+ * @param[in] B Pointer to input matrix B (note: all of it). + * @param[in] bm Number of rows in C and A. + * @param[inout] C Pointer to output matrix C (note: all of it). + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + */ +static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, + const FLOAT *restrict A, BLASLONG bk, + const FLOAT *restrict B, BLASLONG bm, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha, + BLASLONG const offset) { + + FLOAT *restrict C_i = C + first_col * ldc; + /* + * B is in column-order with n_r packed row elements, which does + * not matter -- we always move in full such blocks of + * column*pack + */ + const FLOAT *restrict B_i = B + first_col * bk; + + BLASLONG off = 0; + if (trmm) { + if (left) { + off = offset; + } else { + off = -offset + first_col; + } + } + + /* + * Calculate C_aux := A * B_j + * then unpack C_i += alpha * C_aux. + * + * For that purpose, further partition C_aux and A into blocks + * of m_r (unroll_m) rows, or powers-of-2 if smaller. + */ + BLASLONG row = 0; + for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) + for (; bm - row >= block_size; row += block_size) + GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, + ldc, alpha, offset, off); +} + +/** + * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) + * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and + * C are pointers to submatrices of the actual matrices. + * + * For triangular matrix multiplication, calculate B := alpha (A * B) where A + * or B can be triangular (in case of A, the macro LEFT will be defined). + * + * @param[in] bm Number of rows in C and A. + * @param[in] bn Number of columns in C and B. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] alpha Scalar factor. + * @param[in] ba Pointer to input matrix A. + * @param[in] bb Pointer to input matrix B. + * @param[inout] C Pointer to output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @returns 0 on success. + */ +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, + FLOAT *restrict ba, FLOAT *restrict bb, + FLOAT *restrict C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) + return 0; + + /* + * interface code allocates buffers for ba and bb at page + * granularity (i.e., using mmap(MAP_ANONYMOUS), so enable the compiler + * to make use of the fact in vector load operations. + */ + ba = __builtin_assume_aligned(ba, 16); + bb = __builtin_assume_aligned(bb, 16); + + /* + * Use offset and off even when compiled as SGEMMKERNEL to simplify + * function signatures and function calls. + */ +#ifndef TRMMKERNEL + BLASLONG const offset = 0; +#endif + + /* + * Partition B and C into blocks of n_r (unroll_n) columns, called B_i + * and C_i. For each partition, calculate C_i += alpha * (A * B_j). + * + * For remaining columns that do not fill up a block of n_r, iteratively + * use smaller block sizes of powers of 2. 
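For example, bn = 11 with unroll_n = 8 decomposes into column blocks of widths 8, 2 and 1 (8 + 2 + 1 = 11). A standalone illustration of the decomposition performed by the loop that follows (hypothetical demo code, not part of the kernel):

/* Demo: print the power-of-2 column blocks chosen for a panel width. */
#include <stdio.h>

int main(void) {
    long bn = 11, unroll_n = 8, col = 0;
    for (long block_size = unroll_n; block_size > 0; block_size /= 2)
        for (; bn - col >= block_size; col += block_size)
            printf("block of %ld columns at col %ld\n", block_size, col);
    return 0;   /* prints widths 8, 2, 1 */
}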
+ */ + BLASLONG col = 0; + for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) + for (; bn - col >= block_size; col += block_size) + GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); + + return 0; +} diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6eb0b696b..012c104bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -12575,7 +12575,7 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); -int LAPACKE_get_nancheck( ); +int LAPACKE_get_nancheck( void ); #ifdef __cplusplus } diff --git a/lapack-netlib/SRC/clargv.f b/lapack-netlib/SRC/clargv.f index ba53cae6f..36c5108df 100644 --- a/lapack-netlib/SRC/clargv.f +++ b/lapack-netlib/SRC/clargv.f @@ -200,7 +200,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/clartg.f b/lapack-netlib/SRC/clartg.f index da9a1cdef..baa68b657 100644 --- a/lapack-netlib/SRC/clartg.f +++ b/lapack-netlib/SRC/clartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/SRC/dlartg.f b/lapack-netlib/SRC/dlartg.f index 1c7c46f63..dc49986a0 100644 --- a/lapack-netlib/SRC/dlartg.f +++ b/lapack-netlib/SRC/dlartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/dlartgp.f b/lapack-netlib/SRC/dlartgp.f index 0cb0d2d13..334e416e8 100644 --- a/lapack-netlib/SRC/dlartgp.f +++ b/lapack-netlib/SRC/dlartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartg.f b/lapack-netlib/SRC/slartg.f index 784d4bc36..307c9c83a 100644 --- a/lapack-netlib/SRC/slartg.f +++ b/lapack-netlib/SRC/slartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartgp.f b/lapack-netlib/SRC/slartgp.f index ad76c94b4..f8be5f52b 100644 --- a/lapack-netlib/SRC/slartgp.f +++ b/lapack-netlib/SRC/slartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 
20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/zheequb.f b/lapack-netlib/SRC/zheequb.f index d698232e8..7d719f41e 100644 --- a/lapack-netlib/SRC/zheequb.f +++ b/lapack-netlib/SRC/zheequb.f @@ -271,7 +271,7 @@ AVG = AVG / N STD = 0.0D0 - DO I = N+1, N + DO I = N+1, 2*N WORK( I ) = S( I-N ) * WORK( I-N ) - AVG END DO CALL ZLASSQ( N, WORK( N+1 ), 1, SCALE, SUMSQ ) diff --git a/lapack-netlib/SRC/zlargv.f b/lapack-netlib/SRC/zlargv.f index 1e17983d5..f83ca1851 100644 --- a/lapack-netlib/SRC/zlargv.f +++ b/lapack-netlib/SRC/zlargv.f @@ -201,7 +201,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/zlartg.f b/lapack-netlib/SRC/zlartg.f index 8989bb896..894b4ded0 100644 --- a/lapack-netlib/SRC/zlartg.f +++ b/lapack-netlib/SRC/zlartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index e21a9aabb..778e6f8fa 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -2,6 +2,7 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) +list (REMOVE_ITEM FLOAT_TYPES "HALF") set(LAPACK_SOURCES potrf/potrf_U_single.c @@ -45,6 +46,9 @@ GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" fa GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) foreach (float_type ${FLOAT_TYPES}) +if (${float_type} STREQUAL "HALF") + continue() +endif() GenerateNamedObjects("getrf/getrf_single.c" "UNIT" "getrf_single" false "" "" false ${float_type}) endforeach () diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c index c2fee6bd1..312509685 100644 --- a/lapack/getrf/potrf_parallel.c +++ b/lapack/getrf/potrf_parallel.c @@ -380,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#elif defined(HALF) + mode = BLAS_HALF | BLAS_REAL; + mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; diff --git a/param.h b/param.h index d6cbe544a..e8cf53f0a 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H +#define SHGEMM_DEFAULT_UNROLL_N 4 +#define SHGEMM_DEFAULT_UNROLL_M 8 +#define SHGEMM_DEFAULT_UNROLL_MN 32 +#define SHGEMM_DEFAULT_P 256 +#define SHGEMM_DEFAULT_R 256 +#define SHGEMM_DEFAULT_Q 256 #ifdef OPTERON #define SNUMOPT 4 @@ -1968,7 +1974,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -2248,13 +2254,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 8 #endif -#if defined(POWER9) +#if defined(POWER9) || defined(POWER10) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2282,10 +2288,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #define SYMV_P 8 #endif +#if defined(POWER10) +#undef SHGEMM_DEFAULT_UNROLL_N +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_P +#undef SHGEMM_DEFAULT_R +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_UNROLL_M 16 +#define SHGEMM_DEFAULT_UNROLL_N 8 +#define SHGEMM_DEFAULT_P 832 +#define SHGEMM_DEFAULT_Q 1026 +#define SHGEMM_DEFAULT_R 4096 +#endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 @@ -2468,7 +2492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2612,7 +2636,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 -#if defined(CORTEXA53) || defined(CORTEXA57) || \ +#if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) @@ -2658,6 +2682,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 +#elif defined(CORTEXA53) + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + #elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2988,7 +3041,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 diff --git a/test/Makefile b/test/Makefile index 7a873b7e5..45f9821ec 100644 --- a/test/Makefile +++ b/test/Makefile @@ -64,9 +64,17 @@ endif endif endif +ifeq ($(BUILD_HALF),1) +level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +else level3 : sblat3 dblat3 cblat3 zblat3 +endif ifndef CROSS rm -f ?BLAT3.SUMM +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @@ -78,6 +86,10 @@ ifndef CROSS ifdef SMP rm -f ?BLAT3.SUMM ifeq 
($(USE_OPENMP), 1) +ifeq ($(BUILD_HALF),1) + OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -87,6 +99,10 @@ ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 else +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -165,6 +181,11 @@ zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_HALF),1) +test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) @@ -187,7 +208,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - sblat3 dblat3 cblat3 zblat3 \ + test_shgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ diff --git a/test/cblat1.f b/test/cblat1.f index d6b53d105..ecf2a44cb 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -1,7 +1,49 @@ +*> \brief \b CBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 1 BLAS. +*> Based upon the original BLAS test routine together with: +*> +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT1 -* Test program for the COMPLEX Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. 
INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), - + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0), + + (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -129,10 +171,10 @@ + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), - + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), - + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ - DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ - DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + + (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0), + + (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -145,8 +187,8 @@ + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.19E0,-0.17E0), (0.32E0,0.09E0), - + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (0.19E0,-0.17E0), (0.20E0,-0.35E0), + + (0.35E0,0.20E0), (0.14E0,0.08E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), @@ -162,9 +204,9 @@ + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), - + (0.32E0,0.09E0), (6.0E0,9.0E0), - + (0.23E0,-0.24E0), (8.0E0,3.0E0), - + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + + (0.20E0,-0.35E0), (6.0E0,9.0E0), + + (0.35E0,0.20E0), (8.0E0,3.0E0), + + (0.14E0,0.08E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -177,8 +219,8 @@ + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.09E0,0.03E0), (0.03E0,0.12E0), - + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (0.09E0,0.03E0), (0.15E0,0.00E0), + + (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -193,8 +235,8 @@ + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), - + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), - + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + + (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0), + + (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + REAL ZERO + PARAMETER (NOUT=6, ZERO=0.0E0) * .. Scalar Arguments .. REAL SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). 
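The change just above replaces the old SDIFF-based closeness test, which declared SCOMP(I) close to STRUE(I) only when adding the scaled difference to SSIZE(I) produced no representable change (a trick that aggressive compiler optimization can defeat), with a direct comparison of ABS(SFAC*SD) against ABS(SSIZE(I)) times machine epsilon, obtained from the Fortran intrinsic EPSILON. A minimal C sketch of the same acceptance criterion, with illustrative names only (FLT_EPSILON stands in for EPSILON(ZERO)):

#include <float.h>
#include <math.h>
#include <stdio.h>

/* Accept scomp as "close" to strue when the scaled difference stays
 * within the magnitude of the reference entry times machine epsilon,
 * mirroring the updated Fortran check. */
static int close_enough(float scomp, float strue, float ssize, float sfac)
{
    float sd = scomp - strue;
    return fabsf(sfac * sd) <= fabsf(ssize) * FLT_EPSILON;
}

int main(void)
{
    printf("%d\n", close_enough(1.0f + FLT_EPSILON, 1.0f, 1.0f, 1.0f)); /* 1: off by one ulp */
    printf("%d\n", close_enough(1.001f, 1.0f, 1.0f, 1.0f));             /* 0: clearly wrong */
    return 0;
}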
diff --git a/test/cblat2.f b/test/cblat2.f index 20f188100..8c7bac48e 100644 --- a/test/cblat2.f +++ b/test/cblat2.f @@ -1,68 +1,114 @@ +*> \brief \b CBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'cblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT2 * -* Test program for the COMPLEX Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* CGERC T PUT F FOR NO TEST. SAME COLUMNS. -* CGERU T PUT F FOR NO TEST. SAME COLUMNS. -* CHER T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -71,8 +117,8 @@ PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -126,7 +172,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -135,7 +181,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -240,14 +286,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. @@ -3079,7 +3118,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/cblat3.f b/test/cblat3.f index 5df1ddd64..a65e1364c 100644 --- a/test/cblat3.f +++ b/test/cblat3.f @@ -1,50 +1,96 @@ +*> \brief \b CBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'cblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. 
+*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT3 * -* Test program for the COMPLEX Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* CHERK T PUT F FOR NO TEST. SAME COLUMNS. -* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -53,8 +99,8 @@ PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -103,7 +149,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -112,7 +158,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. 
READ( NIN, FMT = * )REWI @@ -189,14 +235,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. @@ -1301,8 +1340,6 @@ NC = 0 RESET = .TRUE. ERRMAX = RZERO - RALS = RONE - RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) @@ -1948,7 +1985,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1958,12 +1995,19 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to CSYMM and CHEMM +* with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA @@ -1981,6 +2025,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. +* + ALPHA = CMPLX( ONE, -ONE ) + BETA = CMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2207,16 +2259,16 @@ CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2274,16 +2326,16 @@ CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3270,7 +3322,6 @@ 50 CONTINUE END 
IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c new file mode 100644 index 000000000..57aee7b8f --- /dev/null +++ b/test/compare_sgemm_shgemm.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include "../common.h" +#define SGEMM BLASFUNC(sgemm) +#define SHGEMM BLASFUNC(shgemm) +typedef union +{ + unsigned short v; + struct + { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + unsigned short s:1; + unsigned short e:8; + unsigned short m:7; +#else + unsigned short m:7; + unsigned short e:8; + unsigned short s:1; +#endif + } bits; +} bfloat16_bits; + +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + +int +main (int argc, char *argv[]) +{ + int m, n, k; + int i, j, l; + int x; + int ret = 0; + int loop = 100; + char transA = 'N', transB = 'N'; + float alpha = 1.0, beta = 0.0; + + for (x = 0; x <= loop; x++) + { + m = k = n = x; + float A[m * k]; + float B[k * n]; + float C[m * n]; + bfloat16_bits AA[m * k], BB[k * n]; + float DD[m * n], CC[m * n]; + + for (j = 0; j < m; j++) + { + for (i = 0; i < m; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + C[j * k + i] = 0; + AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; + BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + CC[j * k + i] = 0; + DD[j * k + i] = 0; + } + } + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, + &m, B, &k, &beta, C, &m); + SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, + &m, BB, &k, &beta, CC, &m); + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; + if (transA == 'N' && transB == 'N') + { + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (CC[i * m + j] != DD[i * m + j]) + ret++; + } + } + if (ret != 0) + fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); + return ret; +}
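
The bfloat16_bits and float32_bits unions in this new test capture the essential property of bfloat16: it is simply the upper 16 bits of an IEEE-754 single (1 sign bit, 8 exponent bits, 7 mantissa bits). Converting float to bfloat16 therefore amounts to dropping the low 16 mantissa bits (the "v >> 16" in the setup loop), and converting back means shifting the 16 bits into the high half of a 32-bit word, which float16to32 does field by field. A self-contained sketch of both directions using shifts and memcpy instead of bit-fields (the helper names are illustrative, not OpenBLAS API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* float -> bfloat16 by truncation: keep the top 16 bits of the
 * IEEE-754 single, exactly like the "v >> 16" initialization above. */
static uint16_t f32_to_bf16(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof u);   /* well-defined type pun */
    return (uint16_t)(u >> 16);
}

/* bfloat16 -> float: place the 16 bits in the high half and
 * zero-fill the low 16 mantissa bits, as float16to32 does. */
static float bf16_to_f32(uint16_t h)
{
    uint32_t u = (uint32_t)h << 16;
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

int main(void)
{
    printf("%f\n", bf16_to_f32(f32_to_bf16(1.5f)));     /* 1.500000, exact in bfloat16 */
    printf("%f\n", bf16_to_f32(f32_to_bf16(3.14159f))); /* 3.140625, 7-bit mantissa loss */
    return 0;
}

Because bfloat16 keeps only roughly two to three significant decimal digits, the comparison loop presumably tolerates an absolute difference of up to 1.0 between the SGEMM and SHGEMM results, while the DD loop recomputes the product naively from the same truncated inputs as an exact reference for SHGEMM.

diff --git a/test/dblat2.f b/test/dblat2.f index 4002d4368..9bbbe9792 100644 --- a/test/dblat2.f +++ b/test/dblat2.f @@ -1,75 +1,121 @@ +*> \brief \b DBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'dblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS.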
+*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETAC +*> DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGER T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT2 * -* Test program for the DOUBLE PRECISION Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. 
-* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* DGER T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/test/dblat3.f b/test/dblat3.f index 082e03e5e..1ebec4ffa 100644 --- a/test/dblat3.f +++ b/test/dblat3.f @@ -1,55 +1,101 @@ +*> \brief \b DBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'dblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT3 * -* Test program for the DOUBLE PRECISION Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. 
-* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + DOUBLE PRECISION ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. 
+* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat2.f b/test/sblat2.f index a1074be52..56ead8640 100644 --- a/test/sblat2.f +++ b/test/sblat2.f @@ -1,75 +1,121 @@ +*> \brief \b SBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'sblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGER T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 
41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT2 * -* Test program for the REAL Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRMV T PUT F FOR NO TEST. SAME COLUMNS. -* STBMV T PUT F FOR NO TEST. SAME COLUMNS. -* STPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRSV T PUT F FOR NO TEST. SAME COLUMNS. -* STBSV T PUT F FOR NO TEST. SAME COLUMNS. -* STPSV T PUT F FOR NO TEST. SAME COLUMNS. -* SGER T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
-* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat3.f b/test/sblat3.f index 325a9eb92..66edac14e 100644 --- a/test/sblat3.f +++ b/test/sblat3.f @@ -1,55 +1,101 @@ +*> \brief \b SBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'sblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. 
+*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT3 * -* Test program for the REAL Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRSM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. 
@@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat1.f b/test/zblat1.f index 8b4b8d21e..2d7b88490 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -1,7 +1,49 @@ +*> \brief \b ZBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 1 BLAS. +*> +*> Based upon the original BLAS test routine together with: +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT1 -* Test program for the COMPLEX*16 Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), - + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0), + + (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -129,10 +171,10 @@ + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), - + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), - + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ - DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ - DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + + (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0), + + (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -145,8 +187,8 @@ + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.19D0,-0.17D0), (0.32D0,0.09D0), - + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (0.19D0,-0.17D0), (0.20D0,-0.35D0), + + (0.35D0,0.20D0), (0.14D0,0.08D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), @@ -162,9 +204,9 @@ + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), - + (0.32D0,0.09D0), (6.0D0,9.0D0), - + (0.23D0,-0.24D0), (8.0D0,3.0D0), - + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + + (0.20D0,-0.35D0), (6.0D0,9.0D0), + + (0.35D0,0.20D0), (8.0D0,3.0D0), + + (0.14D0,0.08D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -177,8 +219,8 @@ + (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.09D0,0.03D0), (0.03D0,0.12D0), - + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (0.09D0,0.03D0), (0.15D0,0.00D0), + + (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -193,8 +235,8 @@ + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), - + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), - + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + + (0.15D0,0.00D0), 
(6.0D0,9.0D0), (0.00D0,0.15D0), + + (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + DOUBLE PRECISION ZERO + PARAMETER (NOUT=6, ZERO=0.0D0) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/test/zblat2.f b/test/zblat2.f index e65cdcc70..4a20ac567 100644 --- a/test/zblat2.f +++ b/test/zblat2.f @@ -1,68 +1,114 @@ +*> \brief \b ZBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'zblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. 
+*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT2 * -* Test program for the COMPLEX*16 Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -72,8 +118,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -127,7 +173,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -136,7 +182,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -241,14 +287,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. @@ -3087,7 +3126,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat3.f b/test/zblat3.f index f03b1a617..0e38334e9 100644 --- a/test/zblat3.f +++ b/test/zblat3.f @@ -1,50 +1,97 @@ +*> \brief \b ZBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'zblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. 
+*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT3 * -* Test program for the COMPLEX*16 Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -54,8 +101,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -104,7 +151,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. 
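Note on the two mechanical changes that recur across these testers: every OPEN of a summary or snapshot file moves from STATUS = 'NEW' to STATUS = 'UNKNOWN' so the programs can be rerun without first deleting their output files, and the bit-halving loop that computed the machine precision is retired in favour of the Fortran 90 EPSILON intrinsic (one hunk of each kind appears just below). A minimal, self-contained sketch of why the two EPS computations agree, assuming IEEE-754 arithmetic without excess intermediate precision; the program name EPSDEMO is invented for illustration:

      PROGRAM EPSDEMO
*     Retired approach: halve EPS until ONE+EPS rounds to ONE, then
*     double once.  On IEEE hardware this lands exactly on
*     EPSILON(ONE), the value the testers now use directly.
      DOUBLE PRECISION EPS, ONE, HALF
      PARAMETER ( ONE = 1.0D0, HALF = 0.5D0 )
      EPS = ONE
   10 CONTINUE
      IF( ONE+EPS.GT.ONE )THEN
         EPS = HALF*EPS
         GO TO 10
      END IF
      EPS = EPS + EPS
      WRITE( *, * ) 'loop value       ', EPS
      WRITE( *, * ) 'EPSILON intrinsic', EPSILON(ONE)
      END

On an IEEE machine both lines print 2**(-52), about 2.22D-16, for double precision; the original code routed its comparison through a function (SDIFF/DDIFF) precisely to force the sum out of any wider floating-point registers, a precaution EPSILON makes unnecessary.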
@@ -113,7 +160,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -190,14 +237,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. @@ -1303,8 +1343,6 @@ NC = 0 RESET = .TRUE. ERRMAX = RZERO - RALS = RONE - RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) @@ -1951,7 +1989,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1961,12 +1999,20 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to ZSYMM and ZHEMM +* with INFOT = 9 (eca) +* 10-9-00: Declared INTRINSIC DCMPLX (susan) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + DOUBLE PRECISION ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION RALPHA, RBETA @@ -1975,6 +2021,8 @@ * .. External Subroutines .. EXTERNAL ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM +* .. Intrinsic Functions .. + INTRINSIC DCMPLX * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. @@ -1984,6 +2032,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA.
+* + ALPHA = DCMPLX( ONE, -ONE ) + BETA = DCMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2210,16 +2266,16 @@ CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2277,16 +2333,16 @@ CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3276,7 +3332,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE
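Two notes on the SCHKE/ZCHKE changes above. First, the error-exit tests are only unambiguous when each call contains exactly one invalid argument: in the old INFOT = 9 calls with M = 2, both LDB (argument 9) and LDC (argument 12) were below MAX(1,M), so an implementation that happened to validate LDC before LDB could legitimately report 12 and fail the test; raising the C leading dimension to 2 leaves argument 9 as the sole offender. Second, ALPHA, BETA, RALPHA and RBETA now receive defined values before the calls because, even though the dimension checks never read them, passing undefined variables is non-conforming and can trap under IEEE exception checking. A small sketch of the leading-dimension arithmetic, assuming the reference xSYMM rules for SIDE = 'L'; the program name CHKLDC is invented:

      PROGRAM CHKLDC
*     Leading-dimension rules of the reference xSYMM (SIDE = 'L'):
*     LDA >= MAX(1,M) (arg 7), LDB >= MAX(1,M) (arg 9),
*     LDC >= MAX(1,M) (arg 12).  M = 2 as in the INFOT = 9 calls.
      INTEGER M, LDA, LDB, LDC
      M = 2
      LDA = 2
      LDB = 1
*     Old call (..., C, 1): arguments 9 and 12 are both invalid.
      LDC = 1
      WRITE( *, * ) LDA.LT.MAX(1,M), LDB.LT.MAX(1,M), LDC.LT.MAX(1,M)
*     Fixed call (..., C, 2): argument 9 is the only invalid one.
      LDC = 2
      WRITE( *, * ) LDA.LT.MAX(1,M), LDB.LT.MAX(1,M), LDC.LT.MAX(1,M)
      END

The first WRITE prints F T T (two bad arguments), the second F T F (argument 9 alone), which is what makes the corrected calls a fair probe of the INFO = 9 exit.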
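Finally, the revised STEST in the zblat1.f hunks above drops the SDIFF-based exact comparison for a relative tolerance: a computed value SCOMP(I) is accepted when ABS(SFAC*(SCOMP(I)-STRUE(I))) .LE. ABS(SSIZE(I))*EPSILON(ZERO). A sketch of that acceptance predicate using the tester's scale factor SFAC = 9.765625D-4, assuming double precision; the program name TCLOSE and the sample values are invented:

      PROGRAM TCLOSE
*     A computed value passes when its scaled deviation from the
*     true value is within one machine epsilon of the problem size.
      DOUBLE PRECISION SFAC, SCOMP, STRUE, SSIZE, ZERO
      LOGICAL PASS
      PARAMETER ( ZERO = 0.0D0 )
      SFAC = 9.765625D-4
      STRUE = 0.8D0
      SSIZE = 1.0D0
*     A perturbation of a few ULPs is accepted ...
      SCOMP = STRUE + 4.0D0*EPSILON(ZERO)
      PASS = ABS(SFAC*(SCOMP-STRUE)).LE.ABS(SSIZE)*EPSILON(ZERO)
      WRITE( *, * ) PASS
*     ... while an error of order 1.0D-3 is rejected.
      SCOMP = STRUE + 1.0D-3
      PASS = ABS(SFAC*(SCOMP-STRUE)).LE.ABS(SSIZE)*EPSILON(ZERO)
      WRITE( *, * ) PASS
      END

The two comparisons print T then F, matching the intent of the updated STRUE2/STRUE4 reference data: genuinely correct results survive rounding noise, while wrong values still fail.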