diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml new file mode 100644 index 000000000..ca53e8857 --- /dev/null +++ b/.github/workflows/dynamic_arch.yml @@ -0,0 +1,103 @@ +name: continuous build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + fortran: [gfortran, flang] + build: [cmake, make] + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Compilation cache + uses: actions/cache@v2 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + key: ${{ runner.os }}-ccache-${{ github.sha }} + # Restore any ccache cache entry, if none for + # ${{ runner.os }}-ccache-${{ github.sha }} exists + restore-keys: | + ${{ runner.os }}-ccache- + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install coreutils cmake ccache + else + echo "$RUNNER_OS not supported" + exit 1 + fi + ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB + + - name: gfortran build + if: matrix.build == 'make' && matrix.fortran == 'gfortran' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 + + - name: flang build + if: matrix.build == 'make' && matrix.fortran == 'flang' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + exit 0 + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + cd /usr/ + sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz + sudo tar xf flang-20190329-x86-70.tgz + sudo rm flang-20190329-x86-70.tgz + cd - + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang + + + - name: CMake gfortran build + if: matrix.build == 'cmake' && matrix.fortran == 'gfortran' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + mkdir build + cd build + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. 
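+          # DYNAMIC_ARCH=1 builds kernels for every supported CPU family and picks one at runtime, +          # NOFORTRAN=0 keeps the Fortran (and thus LAPACK) parts enabled, and ccache only +          # takes effect because the compilers are resolved through the PATH prefix exported above.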
+ make -j$(nproc) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index f55e73d23..8d7cfea2d 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -21,6 +21,7 @@ jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest env: + DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer HOMEBREW_DEVELOPER: "ON" HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" HOMEBREW_NO_ANALYTICS: "ON" diff --git a/.gitignore b/.gitignore index 6803a919e..bca79f043 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ test/SBLAT2.SUMM test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM +test/SHBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -79,6 +80,7 @@ test/dblat3 test/sblat1 test/sblat2 test/sblat3 +test/test_shgemm test/zblat1 test/zblat2 test/zblat3 diff --git a/.travis.yml b/.travis.yml index c875572b2..307010e40 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,6 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - - set -e - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE @@ -76,6 +75,23 @@ matrix: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" + - os: linux + arch: ppc64le + dist: bionic + compiler: gcc + before_script: + - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y + - sudo apt-get update + - sudo apt-get install gcc-9 gfortran-9 -y + script: + - make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux compiler: gcc addons: @@ -108,7 +124,6 @@ matrix: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' before_script: *common-before script: - - set -e # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" @@ -151,7 +166,6 @@ matrix: before_script: - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" script: - - set -e - mkdir build - CONFIG=Release - cmake -Bbuild -H. 
$CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG diff --git a/CMakeLists.txt b/CMakeLists.txt index c2b9ae7ad..4b82d7670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -23,6 +23,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() @@ -86,10 +87,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () +if (NOT DEFINED BUILD_HALF) + set (BUILD_HALF false) +endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all - set(BUILD_HALF true) +# set(BUILD_HALF true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -121,7 +125,7 @@ if (BUILD_COMPLEX16) list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_SINGLE OR BUILD_HALF) +if (BUILD_HALF) message(STATUS "Building Half Precision") list(APPEND FLOAT_TYPES "HALF") # defines nothing endif () @@ -229,6 +233,7 @@ if (NOT MSVC AND NOT NOFORTRAN) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() + add_subdirectory(lapack-netlib/TESTING) endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES @@ -244,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -353,10 +358,21 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW 
"${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() @@ -373,11 +389,9 @@ if(NOT NO_LAPACKE) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() -include(FindPkgConfig QUIET) -if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) -endif() +# Install pkg-config files +configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) +install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index df497c1d2..aba39e56f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -180,3 +180,13 @@ In chronological order: * [2019-12-23] optimize AVX2 CGEMM and ZGEMM * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels * [2020-01-07] optimize AVX2 SGEMM and STRMM + +* Rajalakshmi Srinivasaraghavan + * [2020-04-15] Half-precision GEMM for bfloat16 + +* Marius Hillenbrand + * [2020-05-12] Revise dynamic architecture detection for IBM z + * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 + +* Danfeng Zhang + * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file diff --git a/Changelog.txt b/Changelog.txt index 5f924629b..cbf0b50f5 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.10 + 14-Jun-2020 + +common: + * Improved thread locking behaviour in blas_server and parallel getrf + * Imported bugfix 394 from LAPACK (spurious reference to "XERBL" + due to overlong lines) + * Imported bugfix 403 from LAPACK (compile option "recursive" required + for correctness with Intel and PGI) + * Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB) + * Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP) + * Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that + could lead to crashes at large matrix sizes + * Restored internal soname in dynamic libraries on FreeBSD and Dragonfly + * Added API (openblas_setaffinity) to set the thread affinity on Linux + * Added initial infrastructure for half-precision floating point + (bfloat16) support with a generic implementation of SHGEMM + * Added CMAKE build system support for building the cblas_Xgemm3m + functions + * Fixed CMAKE support for building in a path with embedded spaces + * Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC + * Fixed GCC version detection in the Makefiles + * Allowed overriding the names of AR, AS and LD in Makefile builds + +POWER: + * Fixed big-endian POWER8 ELFv2 builds on FreeBSD + * Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9 + * Fixed CMAKE build support for POWER9 + * fixed a potential race condition in the thread buffer allocation + * Worked around LAPACK test failures on PPC G4 + +MIPS: + * Fixed a potential race condition in the thread 
buffer allocation + * Added support for MIPS 24K/24KE family based on P5600 kernels + +MIPS64: + * Fixed a potential race condition in the thread buffer allocation + * Added TARGET=GENERIC + +ARMV7: + * Fixed a race condition in the thread buffer allocation + +ARMV8: + * Fixed a race condition in the thread buffer allocation + * Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA + * Improved performance of the ThunderX2 DAXPY kernel + * Added an optimized SGEMM kernel for Cortex A53 + * Fixed Makefile support for INTERFACE64 (8-byte integer) + +x86_64: + * Fixed a syntax error in the CMAKE setup for SkylakeX + * Improved performance of STRSM on Haswell, SkylakeX and Ryzen + * Improved SGEMM performance for workloads with ldc a + multiple of 1024 + * Improved DGEMM performance on SkylakeX + * Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH + builds created on SkylakeX + * Removed data alignment requirement in the SSE2 copy kernels + that could cause spurious crashes + * Added a workaround for an optimizer bug in AppleClang 11.0.3 + * Fixed LAPACK test failures due to wrong options for Intel Fortran + * Fixed compilation and LAPACK test results with recent Flang + and AMD AOCC + * Fixed DYNAMIC_ARCH builds with CMAKE on OS X + * Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max, + cblas_?sum, cblas_?gemm3m in the shared library on OS X + * Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes + show the name of an older generation chip supported by the same kernels) + +IBM Z: + * Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14 + ==================================================================== Version 0.3.9 1-Mar-2020 diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 000000000..2b61bed9f --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,9 @@ +node { + stage('Checkout') { + checkout scm + } + + stage('Build') { + sh("make") + } +} diff --git a/Makefile b/Makefile index 18320e6a3..7a03b08f0 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ ifndef NO_FBLAS $(MAKE) -C test all endif $(MAKE) -C utest all -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) $(MAKE) -C ctest all ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all @@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif -ifndef NO_LAPACKE +ifneq ($(NO_LAPACKE), 1) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif @@ -264,6 +264,7 @@ lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -364,11 +365,12 @@ clean :: @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 ifeq ($(OSNAME), Darwin) @rm -rf getarch.dSYM 
getarch_2nd.dSYM endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h diff --git a/Makefile.arm64 b/Makefile.arm64 index a7cd82e3a..1091edfe5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(CORE), THUNDERX3T110) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +else +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +endif +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.install b/Makefile.install index dad869f4c..7c1a3ca43 100644 --- a/Makefile.install +++ b/Makefile.install @@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig +PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(USE_OPENMP), 1) + ifeq ($(C_COMPILER), PGI) + PKG_EXTRALIB += -lomp + else + PKG_EXTRALIB += -lgomp + endif +endif .PHONY : install .NOTPARALLEL : install @@ -45,7 +53,22 @@ install : lib.grd ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" + @cp cblas.h cblas.tmp +ifdef SYMBOLPREFIX + @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp +endif + @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifneq ($(OSNAME), AIX) @@ -132,7 +155,7 @@ endif @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @@ -168,4 +191,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> 
"$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - diff --git a/Makefile.power b/Makefile.power index 24d8aa8a7..37a02d692 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,23 +9,63 @@ else USE_OPENMP = 1 endif +ifeq ($(CORE), POWER10) +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math +endif + ifeq ($(CORE), POWER9) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -mcpu=power8 -mtune=power8 else -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +CCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +FCOMMON_OPT += -mcpu=power8 -mtune=power8 +else +FCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else +FCOMMON_OPT += -O2 -Mrecursive endif endif ifeq ($(CORE), POWER8) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) +ifeq ($(OSNAME), AIX) +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +else +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif +else +FCOMMON_OPT += -O2 -Mrecursive +endif +endif + +ifeq ($(USE_OPENMP), 1) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +CCOMMON_OPT += -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +FCOMMON_OPT += -DUSE_OPENMP -mp endif endif @@ -68,6 +108,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mpowerpc64 -maix64 +endif ifeq ($(COMPILER_F77), xlf) FCOMMON_OPT += -q64 endif diff --git a/Makefile.rule b/Makefile.rule index 724a60ec4..2c12177ee 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.9.dev +VERSION = 0.3.10.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library @@ -273,6 +273,9 @@ COMMON_PROF = -pg # # CPP_THREAD_SAFETY_TEST = 1 + +# If you want to enable the experimental BFLOAT16 support +# BUILD_HALF = 1 # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index cc30b05a1..4eec20675 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,8 +21,14 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc) +override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 +else ifeq ($(ARCH), armv6) +override ARCH=arm +else ifeq ($(ARCH), armv7) +override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 else ifeq ($(ARCH), zarch) @@ -86,6 +92,9 @@ endif ifeq ($(TARGET), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -107,6 +116,9 @@ endif ifeq ($(TARGET), ARMV8) GETARCH_FLAGS := -DFORCE_ARMV7 endif +ifeq ($(TARGET), POWER8) +GETARCH_FLAGS := -DFORCE_POWER6 +endif endif @@ -125,6 +137,9 @@ endif ifeq ($(TARGET_CORE), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -266,10 +281,10 @@ endif ARFLAGS = CPP = $(COMPILER) -E -AR = $(CROSS_SUFFIX)ar -AS = $(CROSS_SUFFIX)as -LD = $(CROSS_SUFFIX)ld -RANLIB = $(CROSS_SUFFIX)ranlib +AR ?= $(CROSS_SUFFIX)ar +AS ?= $(CROSS_SUFFIX)as +LD ?= $(CROSS_SUFFIX)ld +RANLIB ?= $(CROSS_SUFFIX)ranlib NM = $(CROSS_SUFFIX)nm DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy @@ -282,6 +297,26 @@ NO_LAPACK = 1 override FEXTRALIB = endif +ifeq ($(C_COMPILER), GCC) +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +# Note that the behavior of -dumpversion is compile-time-configurable for +# gcc-7.x and newer. Use -dumpfullversion there +ifeq ($(GCCVERSIONGTEQ7),1) + GCCDUMPVERSION_PARAM := -dumpfullversion +else + GCCDUMPVERSION_PARAM := -dumpversion +endif +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) +endif + # # OS dependent settings # @@ -328,13 +363,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -DMS_ABI endif -ifeq ($(C_COMPILER), GCC) #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Major version > 4 # It is compatible with MSVC ABI. 
@@ -348,7 +377,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) CCOMMON_OPT += -DMS_ABI endif endif -endif # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 @@ -540,7 +568,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX +DYNAMIC_CORE += SKYLAKEX COOPERLAKE endif endif endif @@ -565,11 +593,38 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 +DYNAMIC_CORE += THUNDERX3T110 endif ifeq ($(ARCH), zarch) -DYNAMIC_CORE = Z13 +DYNAMIC_CORE = ZARCH_GENERIC + +# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(ZARCH_SUPPORT_Z13), 1) +DYNAMIC_CORE += Z13 +else +$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +endif + +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 +else +$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) +endif endif ifeq ($(ARCH), power) @@ -577,14 +632,23 @@ DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 +DYNAMIC_CORE += POWER10 endif ifeq ($(C_COMPILER), GCC) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +ifeq ($(GCCVERSIONGTEQ11), 1) +DYNAMIC_CORE += POWER10 +else ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) +DYNAMIC_CORE += POWER10 +endif +else +$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
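+# The version tests above gate the POWER10 kernels on gcc 10.2 or newer (or any gcc 11), as older releases cannot compile them.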
+endif endif endif @@ -745,8 +809,19 @@ endif ifeq ($(C_COMPILER), PGI) ifdef BINARY64 +ifeq ($(ARCH), x86_64) CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +CCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +CCOMMON_OPT += -tp pwr9 +endif +endif +endif +else CCOMMON_OPT += -tp p7 endif endif @@ -765,6 +840,15 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG +FCOMMON_OPT += -Mrecursive -Kieee +ifeq ($(OSNAME), Linux) +ifeq ($(ARCH), x86_64) +FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +ifeq ($(FLANG_VENDOR),AOCC) +FCOMMON_OPT += -fno-unroll-loops +endif +endif +endif ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -860,7 +944,7 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif -FCOMMON_OPT += -recursive +FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif @@ -900,8 +984,19 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif +ifeq ($(ARCH), x86_64) FCOMMON_OPT += -tp p7-64 else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +FCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +FCOMMON_OPT += -tp pwr9 +endif +endif +endif +else FCOMMON_OPT += -tp p7 endif FCOMMON_OPT += -Mrecursive @@ -1129,6 +1224,10 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif +ifeq ($(BUILD_HALF), 1) +CCOMMON_OPT += -DBUILD_HALF +endif + CCOMMON_OPT += -DVERSION=\"$(VERSION)\" ifndef SYMBOLPREFIX @@ -1155,6 +1254,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) @@ -1247,7 +1349,6 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) - override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = @@ -1354,6 +1455,7 @@ export ARCH export CORE export LIBCORE export __BYTE_ORDER__ +export ELF_VERSION export PGCPATH export CONFIG export CC @@ -1399,6 +1501,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export BUILD_HALF export SHGEMM_UNROLL_M export SHGEMM_UNROLL_N diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2de51ef4..00975b25a 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -27,18 +27,54 @@ endif endif endif +ifeq ($(CORE), COOPERLAKE) +ifndef DYNAMIC_ARCH +ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# cooperlake support was added in 10.1 +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) +ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) +CCOMMON_OPT += -march=cooperlake +FCOMMON_OPT += -march=cooperlake +endif +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif +endif +endif + ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion 
| cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) CCOMMON_OPT += -mavx2 endif +endif ifeq ($(F_COMPILER), GFORTRAN) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) FCOMMON_OPT += -mavx2 endif endif endif endif +endif diff --git a/Makefile.zarch b/Makefile.zarch index 47ea1eb71..be1e34f6d 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector endif ifeq ($(CORE), Z14) -CCOMMON_OPT += -march=z14 -mzvector +CCOMMON_OPT += -march=z14 -mzvector -O3 FCOMMON_OPT += -march=z14 -mzvector endif diff --git a/README.md b/README.md index 6dc3c7b42..f8226f5cb 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code -using Git from https://github.com/xianyi/OpenBLAS.git. +using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up-to-date version, be +sure to use the develop branch - master is several years out of date due to a change of maintainership.) Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Most can also be given directly on the make or cmake command line. @@ -58,6 +59,10 @@ Examples: ```sh make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A ``` + or the same with the newer MIPS cross-compiler provided by Loongson, which defaults to the 32-bit ABI: + ```sh + make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A + ``` * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: ```sh diff --git a/TargetList.txt b/TargetList.txt index e2d2f4026..5934f3012 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -22,6 +22,7 @@ SANDYBRIDGE HASWELL SKYLAKEX ATOM +COOPERLAKE b)AMD CPU: ATHLON @@ -49,6 +50,7 @@ POWER6 POWER7 POWER8 POWER9 +POWER10 PPCG4 PPC970 PPC970MP @@ -95,6 +97,7 @@ FALKOR THUNDERX THUNDERX2T99 TSV110 +THUNDERX3T110 9.System Z: ZARCH_GENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index 90d903ad7..2f70ceaf3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,6 +49,12 @@ else GOTO_LAPACK_TARGETS= endif +ifeq ($(BUILD_HALF),1) +GOTO_HALF_TARGETS=shgemm.goto +else +GOTO_HALF_TARGETS= +endif + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto 
daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### +ifeq ($(BUILD_HALF),1) +shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1913,7 +1924,7 @@ dgetri.veclib : 
dgetri.$(SUFFIX) ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ifeq ($(BUILD_HALF),1) +shgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + sgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/gemm.c b/benchmark/gemm.c index dd016a7c3..d2235330b 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE #define GEMM BLASFUNC(dgemm) +#elif defined(HALF) +#define GEMM BLASFUNC(shgemm) #else #define GEMM BLASFUNC(sgemm) #endif @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ int main(int argc, char *argv[]){ - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; char transa = 'N'; @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ k = to; } - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { + if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { + if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ #endif for (i = 0; i < m * k * COMPSIZE; i++) { - a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < k * n * COMPSIZE; i++) { - b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index ed9d4d2e8..136135c9c 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); - +#ifdef RETURN_BY_STACK + DOT (&result , &m, x, &inc_x, y, &inc_y ); +#else result = DOT (&m, x, &inc_x, y, &inc_y ); - +#endif gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; diff --git a/c_check b/c_check index c7899c84f..314c2b157 100644 --- a/c_check +++ b/c_check @@ -6,6 +6,7 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e 
s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -p` if ($hostos eq "AIX"); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); @@ -248,6 +249,28 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } } +$c11_atomics = 0; +if ($data =~ /HAVE_C11/) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; + $c11_atomics = 0; + } else { + ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); + print $tmpf "#include <stdatomic.h>\nint main(void){}\n"; + $args = " -c -o $tmpf.o $tmpf"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $c11_atomics = 0; + } else { + $c11_atomics = 1; + } + unlink("$tmpf.o"); + } +} + + $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -310,6 +333,7 @@ $linker_a = ""; && ($flags !~ /advapi32/) && ($flags !~ /shell32/) && ($flags !~ /omp/) + && ($flags !~ /[0-9]+/) ) { $linker_l .= $flags . " " } @@ -350,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; +print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; + if ($os eq "LINUX") { diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9d51f777c..c00f8fe71 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,11 +45,11 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) endif () if (POWER) - set(DYNAMIC_CORE POWER6 POWER8 POWER9) + set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) endif () if (X86) @@ -76,9 +76,9 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - endif () + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d5551147c..c490dd9ab 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -103,3 +103,16 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () endif () + +if (${CORE} STREQUAL "COOPERLAKE") + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif() + endif () + endif () +endif () diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index f877fc3e1..0f5d0e15d 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -21,7 +21,15 @@ # NEED2UNDERSCORES if (NOT NO_LAPACK) - enable_language(Fortran) + include(CheckLanguage) + check_language(Fortran) + if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + else() + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + set (NOFORTRAN 1) + set (NO_LAPACK 
1) + endif() else() include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index cc330ae2c..fc1f9bb22 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") endif () if (${F_COMPILER} STREQUAL "G77") diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 1c1fed571..4b505a102 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,6 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) +if (BUILD_HALF) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -131,6 +132,7 @@ macro(SetDefaultL1) set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL ../arm/sum.c) set(SHSWAPKERNEL ../arm/swap.c) +endif () endmacro () macro(SetDefaultL2) @@ -179,10 +181,11 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +if (BUILD_HALF) set(SHGEMVNKERNEL ../arm/gemv_n.c) set(SHGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) - +endif () endmacro () macro(SetDefaultL3) @@ -190,6 +193,7 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) +if (BUILD_HALF) set(SHGEADD_KERNEL ../generic/geadd.c) set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) set(SHGEMM_BETA ../generic/gemm_beta.c) @@ -201,6 +205,6 @@ macro(SetDefaultL3) set(SHGEMMITCOPYOBJ shgemm_itcopy.o) set(SHGEMMONCOPYOBJ shgemm_oncopy.o) set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) - +endif () endmacro () diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index df4b2ab06..0bd49f996 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -7,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} diff --git a/cmake/os.cmake b/cmake/os.cmake index 2d25e7aaa..c644bc3f7 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () -if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") set(EXTRALIB "${EXTRALIB} -lm") set(NO_EXPRECISION 1) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index e0696093b..3b2a9d6a2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFP\n" "#define HAVE_NEON\n" "#define ARMV8\n") +if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) +else () + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) +endif () set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) @@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX3T110") + file(APPEND ${TARGET_CONF_TEMP} + "#define THUNDERX3T110\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + 
"#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t94371840\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "TSV110") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" @@ -420,7 +452,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) - elseif ("${TCORE}" STREQUAL "POWER9") + elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" "#define L1_DATA_LINESIZE 128\n" @@ -492,7 +524,7 @@ else(NOT CMAKE_CROSSCOMPILING) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -520,7 +552,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) diff --git a/cmake/system.cmake b/cmake/system.cmake index 65e5aa508..e3617c4e2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") @@ -45,6 +45,18 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () if (DEFINED TARGET) + if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() @@ -297,6 +309,16 @@ if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED MAX_STACK_ALLOC) +if (NOT ${MAX_STACK_ALLOC} EQUAL 0) +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") +endif () +else () +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") +endif () +endif () + if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -407,6 +429,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") +if ("${F_COMPILER}" STREQUAL "FLANG") +if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") +endif () +endif () +endif () + if (NOT DEFINED SUFFIX) set(SUFFIX o) endif () diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94eb0a9c6..4382ffc4e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -116,3 +116,10 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.c" "avx512.o") endif() + +include(CheckIncludeFile) +CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) +if (HAVE_C11 EQUAL 1) +message (STATUS found stdatomic.h) +set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") +endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 831ddffe6..1c21e776e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,12 +15,36 @@ endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") + set (IfElse 0) + set (ElseSeen 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) +#message(STATUS "parsing ${makefile_line}") + if (${IfElse} GREATER 0) + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + continue () + endif () + string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + continue () + endif() + if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) +# message(STATUS "skipping ${makefile_line}") + continue () + endif () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) - set(var_value ${CMAKE_MATCH_2}) +# set(var_value ${CMAKE_MATCH_2}) + string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. 
$(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) foreach (make_var ${make_var_matches}) @@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN) else () string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on include ${line_match}") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + else () +# message(STATUS "unmatched line ${line_match}") + string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + else () + string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + endif () + endif () endif () endif () endforeach () diff --git a/common.h b/common.h index e2c8cdee5..d6637abe4 100644 --- a/common.h +++ b/common.h @@ -360,13 +360,8 @@ typedef int blasint; #endif #endif -#ifdef POWER8 -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif -#ifdef POWER9 +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif @@ -686,7 +681,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #if defined(C_GCC) && ( __GNUC__ < 7) // workaround for GCC bug 65467 #ifndef _Atomic diff --git a/common_level3.h b/common_level3.h index 6d5cfa65d..88a521b73 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif -extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, +void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, float * A, BLASLONG strideA, float * B, BLASLONG strideB, float * R, BLASLONG strideR); -extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, diff --git a/common_mips.h b/common_mips.h index dd2f8d558..7dc3ba246 100644 --- a/common_mips.h +++ b/common_mips.h @@ -94,7 +94,7 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) -#define BUFFER_SIZE (16 << 20) +#define BUFFER_SIZE (16 << 21) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_mips64.h b/common_mips64.h index af638d60c..a06edfe08 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -227,7 +227,7 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 32 << 21) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) diff --git a/common_param.h b/common_param.h index 19a34fa3d..0437482dc 100644 --- a/common_param.h +++ b/common_param.h @@ -47,7 +47,7 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; 
-#if 1 +#ifdef BUILD_HALF int shgemm_p, shgemm_q, shgemm_r; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; @@ -175,6 +175,11 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#ifdef ARCH_X86_64 + void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); + int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); +#endif + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -1002,12 +1007,14 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#ifdef BUILD_HALF #define SHGEMM_P gotoblas -> shgemm_p #define SHGEMM_Q gotoblas -> shgemm_q #define SHGEMM_R gotoblas -> shgemm_r #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#endif #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q @@ -1086,6 +1093,7 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif +#ifdef BUILD_HALF #define SHGEMM_P SHGEMM_DEFAULT_P #define SHGEMM_Q SHGEMM_DEFAULT_Q #define SHGEMM_R SHGEMM_DEFAULT_R @@ -1096,6 +1104,7 @@ extern gotoblas_t *gotoblas; #else #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) #endif +#endif #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q @@ -1330,31 +1339,31 @@ extern gotoblas_t *gotoblas; #endif #ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A 
+ GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) #endif #ifndef SNUMOPT diff --git a/common_power.h b/common_power.h index e29d0f382..e0685f760 100644 --- a/common_power.h +++ b/common_power.h @@ -68,7 +68,7 @@ #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #define RMB __asm__ __volatile__ ("eieio":::"memory") @@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ " bne- 1f\n" " stwcx. %2,0, %1\n" " bne- 0b\n" + " isync\n" "1: " : "=&r"(ret) : "r"(address), "r" (val) @@ -272,7 +273,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -294,7 +295,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -843,7 +844,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) +#elif defined(POWER8) || defined(POWER9) || defined(POWER10) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/common_s.h b/common_s.h index 270d55e9a..685d73062 100644 --- a/common_s.h +++ b/common_s.h @@ -45,6 +45,10 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct + #define SGEMM_ONCOPY sgemm_oncopy #define SGEMM_OTCOPY sgemm_otcopy @@ -214,6 +218,14 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L +#ifdef ARCH_X86_64 +#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant +#define SGEMM_DIRECT gotoblas -> sgemm_direct +#else +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct +#endif + #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy #define SGEMM_OTCOPY gotoblas -> 
sgemm_otcopy
#define SGEMM_INCOPY gotoblas -> sgemm_incopy
diff --git a/common_thread.h b/common_thread.h
index 6ec40e096..ec0c65b22 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -132,18 +132,18 @@ extern int blas_server_avail;
 static __inline int num_cpu_avail(int level) {
 #ifdef USE_OPENMP
-        int openmp_nthreads=0;
+        int openmp_nthreads=omp_get_max_threads();
 #endif
+#ifndef USE_OPENMP
   if (blas_cpu_number == 1
-
-#ifdef USE_OPENMP
-      || omp_in_parallel()
 #endif
-      ) return 1;
+#ifdef USE_OPENMP
+  if (openmp_nthreads == 1 || omp_in_parallel()
+#endif
+      ) return 1;
 #ifdef USE_OPENMP
-  openmp_nthreads=omp_get_max_threads();
   if (blas_cpu_number != openmp_nthreads) {
           goto_set_num_threads(openmp_nthreads);
   }
diff --git a/common_x86_64.h b/common_x86_64.h
index 0247674cd..bee7e8cdb 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
 #endif
   do {
-    while (*address) {YIELDING;};
+    while (*address) {YIELDING;}
 #ifndef C_MSVC
     __asm__ __volatile__(
@@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){
 #else
 extern unsigned int blas_quick_divide_table[];
-static __inline int blas_quickdivide(unsigned int x, unsigned int y){
+static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
-  unsigned int result;
+  volatile unsigned int result;
   if (y <= 1) return x;
@@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
   y = blas_quick_divide_table[y];
   __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
-
   return result;
 }
 #endif
@@ -229,14 +228,8 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
 #define HUGE_PAGESIZE ( 2 << 20)
 #ifndef BUFFERSIZE
-#if defined(SKYLAKEX)
-#define BUFFER_SIZE (32 << 21)
-#elif defined(HASWELL) || defined(ZEN)
 #define BUFFER_SIZE (32 << 22)
 #else
-#define BUFFER_SIZE (32 << 20)
-#endif
-#else
 #define BUFFER_SIZE (32 << BUFFERSIZE)
 #endif
diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h
index 60ab5bb2f..8005369a8 100644
--- a/cpp_thread_test/cpp_thread_safety_common.h
+++ b/cpp_thread_test/cpp_thread_safety_common.h
@@ -5,6 +5,14 @@ inline void pauser(){
 	std::getline(std::cin, dummy);
 }
+
+void FailIfThreadsAreZero(uint32_t numConcurrentThreads) {
+	if(numConcurrentThreads == 0) {
+		std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl;
+		exit(-1);
+	}
+}
+
 inline void InitializeMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
 	for(uint32_t i=0; i<(numConcurrentThreads*numMat); i++){
 		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp
index 1c5287524..104c64f2a 100644
--- a/cpp_thread_test/dgemm_thread_safety.cpp
+++ b/cpp_thread_test/dgemm_thread_safety.cpp
@@ -46,6 +46,8 @@ int main(int argc, char* argv[]){
 	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
 	std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+	FailIfThreadsAreZero(numConcurrentThreads);
+
diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp
--- a/cpp_thread_test/dgemv_thread_safety.cpp
+++ b/cpp_thread_test/dgemv_thread_safety.cpp
 	if(argc > 4){
 		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
 		abort();
 	}
 	std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+	FailIfThreadsAreZero(numConcurrentThreads);
diff --git a/cpuid_power.c b/cpuid_power.c
--- a/cpuid_power.c
+++ b/cpuid_power.c
 #include <stdio.h>
 #ifdef _AIX
+#include <sys/systemcfg.h>
 #include
 #endif
 #ifdef __APPLE__
@@ -57,6 +58,7 @@
 #define CPUTYPE_PPCG4 7
 #define CPUTYPE_POWER8 8
 #define CPUTYPE_POWER9 9
+#define CPUTYPE_POWER10 10
 char *cpuname[] = {
   "UNKNOWN",
@@
-68,7 +70,8 @@ char *cpuname[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; char *lowercpuname[] = { @@ -81,7 +84,8 @@ char *lowercpuname[] = { "cell", "ppcg4", "power8", - "power9" + "power9", + "power10" }; char *corename[] = { @@ -94,7 +98,8 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; int detect(void){ @@ -125,6 +130,7 @@ int detect(void){ if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; + if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -132,34 +138,19 @@ int detect(void){ #endif #ifdef _AIX - FILE *infile; - char buffer[512], *p; + // Cast from int to unsigned to ensure comparisons work for all bits in + // the bit mask, even the top bit + unsigned implementation = (unsigned) _system_configuration.implementation; - p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Pro", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - pclose(infile); - - if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; - if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; - if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; - if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; - if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; - if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; - if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; + if (implementation >= 0x40000u) return CPUTYPE_POWER10; + else if (implementation & 0x20000) return CPUTYPE_POWER9; + else if (implementation & 0x10000) return CPUTYPE_POWER8; + else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7 + else if (implementation & 0x04000) return CPUTYPE_POWER6; + else if (implementation & 0x02000) return CPUTYPE_POWER5; + else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 + else if (implementation & 0x00800) return CPUTYPE_POWER4; + else return CPUTYPE_POWER3; #endif #ifdef __APPLE__ @@ -179,6 +170,9 @@ int detect(void){ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { + case 0x80: // POWER10 + return CPUTYPE_POWER10; + break; case 0x4e: // POWER9 return CPUTYPE_POWER9; break; diff --git a/cpuid_x86.c b/cpuid_x86.c index e29adecae..728d459d1 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -249,6 +249,22 @@ int support_avx512(){ #endif } +int support_avx512_bf16(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); + if((eax & 32) == 32){ + ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not + } + return ret; +#else + return 0; +#endif +} int get_vendor(void){ int eax, ebx, ecx, edx; @@ -335,6 +351,7 @@ int get_cputype(int gettype){ if (support_avx()) feature |= HAVE_AVX; if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; + if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1337,6 
+1354,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 5: // Skylake X + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) @@ -1406,6 +1425,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + } + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -1443,10 +1473,11 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 3: - case 7: - case 10: +// case 7: +// case 10: return CPUTYPE_BARCELONA; case 5: + case 7: return CPUTYPE_BOBCAT; case 6: switch (model) { @@ -1496,6 +1527,8 @@ int get_cpuname(void){ // AMD Ryzen case 8: // AMD Ryzen2 + default: + // Matisse/Renoir and other recent Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -1505,6 +1538,16 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } + break; + case 10: // Zen3 + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; } break; } @@ -1653,7 +1696,8 @@ static char *cpuname[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *lowercpuname[] = { @@ -1709,7 +1753,8 @@ static char *lowercpuname[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; static char *corename[] = { @@ -1742,7 +1787,8 @@ static char *corename[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *corename_lower[] = { @@ -1775,7 +1821,8 @@ static char *corename_lower[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; @@ -1955,6 +2002,19 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + } case 5: switch (model) { case 6: @@ -1970,7 +2030,9 @@ int get_coretype(void){ case 5: // Skylake X #ifndef NO_AVX512 - return CORE_SKYLAKEX; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; #else if(support_avx()) #ifndef NO_AVX2 @@ -2083,7 +2145,7 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. 
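A note on the detection logic above: CPUID leaf 7 carries sub-leaves, and the AVX512_BF16 capability that distinguishes Cooper Lake from Skylake-X is reported in sub-leaf 1, EAX bit 5, so a plain leaf-7 query is not enough. Below is a minimal standalone probe of the same bits, assuming GCC/Clang's <cpuid.h>; it is an illustrative sketch only, and unlike the library's support_avx512_bf16() it skips the xgetbv check that verifies the OS actually saves AVX512 register state.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        /* CPUID leaf 7, sub-leaf 0: EBX bit 16 = AVX512F, the baseline for all AVX512 extensions */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) || !(ebx & (1u << 16))) {
            printf("no AVX512F\n");
            return 0;
        }
        /* CPUID leaf 7, sub-leaf 1: EAX bit 5 = AVX512_BF16, the bit tested by support_avx512_bf16() */
        __cpuid_count(7, 1, eax, ebx, ecx, edx);
        printf((eax & (1u << 5)) ? "AVX512_BF16 available\n" : "no AVX512_BF16\n");
        return 0;
    }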
- case 5: // New EXCAVATOR + case 5: // New EXCAVATOR if(support_avx()) return CORE_EXCAVATOR; else @@ -2111,12 +2173,14 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8) { + } else if (exfamily == 8 || exfamily == 10) { switch (model) { case 1: // AMD Ryzen case 8: - // Ryzen 2 + // Ryzen 2 + default: + // Matisse,Renoir Ryzen2 models if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; @@ -2237,6 +2301,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); + if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2307,6 +2372,7 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); + if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); diff --git a/ctest.c b/ctest.c index 5e869b901..cd84ab1bb 100644 --- a/ctest.c +++ b/ctest.c @@ -153,3 +153,6 @@ ARCH_ARM ARCH_ARM64 #endif +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +HAVE_C11 +#endif diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 881b4ee35..09a62d9bf 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +endif + SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 - +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index c6bbb9ca9..a38506585 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 9216daaed..39824fc5a 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ 
>= 201112L +#ifdef HAVE_C11 _Atomic #else volatile diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 574f825b0..a041abac3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 5a8d497d2..6e1fd9e99 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 9117090b5..1027c0c73 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,7 +135,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -205,7 +205,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -300,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -370,7 +370,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 62c6a2442..e8df7fb21 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if 
defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/Makefile b/driver/others/Makefile index 5653f3c25..7558ec058 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -47,8 +47,10 @@ endif endif ifdef USE_CUDA +ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif +endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index f13b83dd4..756e51b5d 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -141,7 +141,7 @@ typedef struct { } thread_status_t; -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -#if defined(OS_LINUX) && !defined(NO_AFFINITY) +#if defined(OS_LINUX) && !defined(NO_AFFINITY) int gotoblas_set_affinity(int); int gotoblas_set_affinity2(int); int get_node(void); @@ -281,6 +281,8 @@ int get_node(void); static int increased_threads = 0; #ifdef OS_LINUX +extern int openblas_get_num_threads(void); + int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { const int active_threads = openblas_get_num_threads(); @@ -602,7 +604,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, 
&rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4255852c8..d9969b599 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -55,7 +55,7 @@ int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #else static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -320,7 +320,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Bool inuse = false; if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { #else @@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(OMP_SCHED) +#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 @@ -345,7 +345,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ exec_threads(&queue[i], buf_index); } -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 atomic_store(&blas_buffer_inuse[buf_index], false); #else blas_buffer_inuse[buf_index] = false; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2e87e186a..5d71b1b2c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,7 +332,7 @@ int support_avx512(){ if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } - if((ebx & (1<<31)) != 0){ + if((ebx & (1u<<31)) != 0){ xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + case 10: + if (model == 5 || model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: @@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){ cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( (eax & 0xffff) >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) return NULL; } else @@ -644,7 +656,7 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else if (exfamily == 5) { + } else if (exfamily == 5 || exfamily == 7) { return &gotoblas_BOBCAT; } else if (exfamily == 6) { if(model == 1){ @@ -698,7 +710,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1 || model == 8) { + /* if (model == 1 || model == 8) */ { if(support_avx()) return &gotoblas_ZEN; else{ @@ -706,16 +718,24 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } - } else if (exfamily == 9) { + } else if (exfamily == 9) { if(support_avx()) return &gotoblas_ZEN; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
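The HAVE_C11 guards introduced in this change set replace repeated __STDC_VERSION__ >= 201112L probes: ctest.c now reports HAVE_C11 once per build and every consumer keys off that flag. The buffer pool in blas_server_omp.c (above) is the main beneficiary; each parallel region claims one of MAX_PARALLEL_NUMBER buffer slots with a weak compare-and-swap. A self-contained sketch of that claim/release pattern, assuming any C11 compiler (the names here are illustrative, not the library's):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define NUM_SLOTS 8                      /* stand-in for MAX_PARALLEL_NUMBER */
    static atomic_bool slot_inuse[NUM_SLOTS];

    static int claim_slot(void) {
        for (;;) {                           /* spin until some slot frees up */
            for (int i = 0; i < NUM_SLOTS; i++) {
                bool expected = false;
                /* a weak CAS may fail spuriously; the retry loop absorbs that */
                if (atomic_compare_exchange_weak(&slot_inuse[i], &expected, true))
                    return i;
            }
        }
    }

    static void release_slot(int i) {
        atomic_store(&slot_inuse[i], false); /* mirrors the atomic_store on buf_index above */
    }

The weak exchange is the right choice here because the surrounding loop retries anyway, and it can be cheaper than the strong form on some architectures.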
- } + } + } else if (exfamily == 10) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } }else { return &gotoblas_BARCELONA; } + } } @@ -764,18 +784,53 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_ATOM) +#ifdef DYNAMIC_OLDER + return corename[ 6]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_PENRYN) +#ifdef DYNAMIC_OLDER + return corename[ 8]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_DUNNINGTON) +#ifdef DYNAMIC_OLDER + return corename[ 9]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_NEHALEM) return corename[10]; if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_OPTERON_SSE3) +#ifdef DYNAMIC_OLDER + return corename[12]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_OPTERON) +#ifdef DYNAMIC_OLDER + return corename[13]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == &gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_NANO) +#ifdef DYNAMIC_OLDER + return corename[15]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BOBCAT) +#ifdef DYNAMIC_OLDER + return corename[17]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; @@ -787,6 +842,7 @@ char *gotoblas_corename(void) { } + static gotoblas_t *force_coretype(char *coretype){ int i ; diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 11ef2725c..157b03365 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +extern gotoblas_t gotoblas_THUNDERX3T110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 11 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -82,6 +83,7 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "thunderx3t110", "unknown" }; @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; return corename[NUM_CORETYPES]; } @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); + case 11: return (&gotoblas_THUNDERX3T110); } 
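For context on the new ThunderX3 entries: the ARM64 dispatcher distinguishes cores by MIDR_EL1, where bits 24-31 carry the implementer (0x43 for Cavium/Marvell) and bits 4-15 the part number (0x0b8 for ThunderX3 T110, matching the case added below). A hedged sketch of that decode, assuming an AArch64 Linux host where the kernel traps and emulates the MIDR_EL1 read for user space:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint64_t midr;
        /* privileged register; Linux emulates this read when HWCAP_CPUID is set */
        __asm__ volatile("mrs %0, MIDR_EL1" : "=r"(midr));
        unsigned implementer = (midr >> 24) & 0xff;   /* 0x43 = Cavium/Marvell */
        unsigned part        = (midr >> 4)  & 0xfff;  /* 0x0af = ThunderX2, 0x0b8 = ThunderX3 */
        printf("implementer=0x%02x part=0x%03x\n", implementer, part);
        return 0;
    }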
 	snprintf(message, 128, "Core not found: %s\n", coretype);
 	openblas_warning(1, message);
@@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) {
 			return &gotoblas_THUNDERX;
 		case 0x0af: // ThunderX2
 			return &gotoblas_THUNDERX2T99;
+		case 0x0b8: // ThunderX3
+			return &gotoblas_THUNDERX3T110;
 		}
 		break;
 	case 0x48: // HiSilicon
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 8c831b998..ca1d42408 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -6,6 +6,13 @@ extern gotoblas_t gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 extern gotoblas_t gotoblas_POWER9;
 #endif
+#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
+    || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
+#define HAVE_P10_SUPPORT 1
+#endif
+#ifdef HAVE_P10_SUPPORT
+extern gotoblas_t gotoblas_POWER10;
+#endif
 extern void openblas_warning(int verbose, const char *msg);
@@ -13,7 +20,8 @@ static char *corename[] = {
 	"unknown",
 	"POWER6",
 	"POWER8",
-	"POWER9"
+	"POWER9",
+	"POWER10"
 };
 #define NUM_CORETYPES 4
@@ -23,6 +31,9 @@ char *gotoblas_corename(void) {
 	if (gotoblas == &gotoblas_POWER8)	return corename[2];
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	if (gotoblas == &gotoblas_POWER9)	return corename[3];
+#endif
+#ifdef HAVE_P10_SUPPORT
+	if (gotoblas == &gotoblas_POWER10)	return corename[4];
 #endif
 	return corename[0];
 }
@@ -36,6 +47,10 @@ static gotoblas_t *get_coretype(void) {
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	if (__builtin_cpu_is("power9"))
 		return &gotoblas_POWER9;
+#endif
+#ifdef HAVE_P10_SUPPORT
+	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
+		return &gotoblas_POWER10;
 #endif
 	return NULL;
 }
@@ -61,6 +76,9 @@ static gotoblas_t *force_coretype(char * coretype) {
 	case 2: return (&gotoblas_POWER8);
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	case 3: return (&gotoblas_POWER9);
+#endif
+#ifdef HAVE_P10_SUPPORT
+	case 4: return (&gotoblas_POWER10);
 #endif
 	default: return NULL;
 	}
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 90d3051b1..403b34111 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -1,12 +1,58 @@
-
 #include "common.h"
+#include <stdlib.h>
+
+// Gate kernels for z13 and z14 on gcc version
+#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
+    /* RHEL 7 since 7.3: */ \
+    (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
+     __GNUC_RH_RELEASE__ >= 11)
+#define HAVE_Z13_SUPPORT
+#endif
+
+#if __GNUC__ >= 7
+#define HAVE_Z14_SUPPORT
+#endif
+
+// Guard the use of getauxval() on glibc version >= 2.16
+#ifdef __GLIBC__
+#include <features.h>
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAVE_GETAUXVAL 1
+
+static unsigned long get_hwcap(void)
+{
+	unsigned long hwcap = getauxval(AT_HWCAP);
+	char *maskenv;
+
+	// honor requests for not using specific CPU features in LD_HWCAP_MASK
+	maskenv = getenv("LD_HWCAP_MASK");
+	if (maskenv)
+		hwcap &= strtoul(maskenv, NULL, 0);
+
+	return hwcap;
+	// note that a missing auxval is interpreted as no capabilities
+	// available, which is safe.
+}
+
+#else // __GLIBC_PREREQ(2, 16)
+#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
+
+static unsigned long get_hwcap(void) {
+	// treat missing support for getauxval() as no capabilities available,
+	// which is safe.
+ return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +extern gotoblas_t gotoblas_ZARCH_GENERIC; +#ifdef HAVE_Z13_SUPPORT extern gotoblas_t gotoblas_Z13; +#endif +#ifdef HAVE_Z14_SUPPORT extern gotoblas_t gotoblas_Z14; -//extern gotoblas_t gotoblas_Z15; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -//extern gotoblas_t gotoblas_Z14; -//#endif +#endif #define NUM_CORETYPES 4 @@ -16,47 +62,50 @@ static char* corename[] = { "unknown", "Z13", "Z14", -// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { +#ifdef HAVE_Z13_SUPPORT if (gotoblas == &gotoblas_Z13) return corename[1]; +#endif +#ifdef HAVE_Z14_SUPPORT if (gotoblas == &gotoblas_Z14) return corename[2]; -// if (gotoblas == &gotoblas_Z15) return corename[3]; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// if (gotoblas == &gotoblas_POWER9) return corename[3]; -//#endif - return corename[0]; // try generic? +#endif + if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + + return corename[0]; } -// __builtin_cpu_is is not supported by zarch +/** + * Detect the fitting set of kernels by retrieving the CPU features supported by + * OS from the auxiliary value AT_HWCAP and choosing the set of kernels + * ("coretype") that exploits most of the features and can be compiled with the + * available gcc version. + * Note that we cannot use vector registers on a z13 or newer unless supported + * by the OS kernel (which needs to handle them properly during context switch). + */ static gotoblas_t* get_coretype(void) { - FILE* infile; - char buffer[512], * p; - p = (char*)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Type", buffer, 4)) { - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); + unsigned long hwcap __attribute__((unused)) = get_hwcap(); + + // z14 and z15 systems: exploit Vector Facility (SIMD) and + // Vector-Enhancements Facility 1 (float SIMD instructions), if present. 
+#ifdef HAVE_Z14_SUPPORT + if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return &gotoblas_Z14; #endif - break; - } - } - fclose(infile); + // z13: Vector Facility (SIMD for double) +#ifdef HAVE_Z13_SUPPORT + if (hwcap & HWCAP_S390_VX) + return &gotoblas_Z13; +#endif - if (strstr(p, "2964")) return &gotoblas_Z13; - if (strstr(p, "2965")) return &gotoblas_Z13; - if (strstr(p, "3906")) return &gotoblas_Z14; - if (strstr(p, "3907")) return &gotoblas_Z14; - if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 - - return NULL; // should be ZARCH_GENERIC + // fallback in case of missing compiler support, systems before z13, or + // when the OS does not advertise support for the Vector Facility (e.g., + // missing support in the OS kernel) + return &gotoblas_ZARCH_GENERIC; } static gotoblas_t* force_coretype(char* coretype) { @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { switch (found) { +#ifdef HAVE_Z13_SUPPORT case 1: return (&gotoblas_Z13); +#endif +#ifdef HAVE_Z14_SUPPORT case 2: return (&gotoblas_Z14); -// case 3: return (&gotoblas_Z15); -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// case 3: return (&gotoblas_POWER9); -//#endif +#endif + case 3: return (&gotoblas_ZARCH_GENERIC); default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) { - snprintf(coremsg, 128, "Falling back to Z14 core\n"); + snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); openblas_warning(1, coremsg); - gotoblas = &gotoblas_Z14; + gotoblas = &gotoblas_ZARCH_GENERIC; } if (gotoblas && gotoblas->init) { diff --git a/driver/others/memory.c b/driver/others/memory.c index 5abcbf3a4..9b6c226a1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1095,7 +1095,7 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static _Atomic int memory_initialized = 0; #else static volatile int memory_initialized = 0; @@ -2070,7 +2070,7 @@ if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; perror("OpenBLAS : munmap failed:"); - printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); + printf("error code=%d,\trelease->address=%p\n",errsv,release->address); } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index b1f3befae..5d312fa87 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -180,9 +180,10 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ - defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) + defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -266,7 +267,9 @@ int 
get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ + defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ + defined(SKYLAKEX) || defined(COOPERLAKE) int size = 16; #else int size = get_L2_size(); diff --git a/exports/Makefile b/exports/Makefile index 60291b1ff..75901586c 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif +ifndef BUILD_HALF +BUILD_HALF = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS @@ -51,6 +55,10 @@ endif endif endif +ifeq ($(C_COMPILER), PGI) +EXTRALIB += -pgf90libs +endif + ifneq (,$(filter 1 2,$(NOFORTRAN))) FEXTRALIB = endif @@ -151,8 +159,12 @@ ifeq ($(F_COMPILER), INTEL) -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else - ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -234,23 +246,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" 
$(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 235446f14..73b4be248 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -30,7 +30,7 @@ icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - shgemm, smax,smin,snrm2, + smax,smin,snrm2, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, @@ -40,17 +40,13 @@ ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, xerbla, saxpby,daxpby,caxpby,zaxpby, + somatcopy, domatcopy, comatcopy, zomatcopy, + simatcopy, dimatcopy, cimatcopy, zimatcopy, sgeadd,dgeadd,cgeadd,zgeadd, - somatcopy, - simatcopy, - domatcopy, - dimatcopy, - comatcopy, - cimatcopy, - zomatcopy, - zimatcopy, + ssum, dsum, scsum, dzsum ); +@halfblasobjs = (shgemm); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -67,7 +63,7 @@ cblas_isamax, cblas_izamax, cblas_sasum, cblas_saxpy, cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, - cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, @@ -80,9 +76,16 @@ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, - cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd + cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, + cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, + cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, + cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, + cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, + cblas_xerbla ); +@halfcblasobjs = (cblas_shgemm); + @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, @@ -3454,6 +3457,10 @@ use File::Spec; use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); +if ($ARGV[12] == 1) { + @blasobjs = (@blasobjs, @halfblasobjs); + @cblasobjs = (@cblasobjs, @halfcblasobjs); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); @@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, 
 @gemm3mobjs); };
 	if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
 	if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); };
-
 if ($ARGV[4] == 0) {
     @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs);
+    if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
+    if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
+    if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
+    if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); };
 }else{
 #NO_CBLAS=1
     @no_underscore_objs = (@misc_no_underscore_objs);
diff --git a/f_check b/f_check
index fac8fc707..dd4d3475c 100644
--- a/f_check
+++ b/f_check
@@ -82,6 +82,9 @@ if ($compiler eq "") {
 	if ($compiler =~ /flang/) {
 	    $vendor = FLANG;
 	    $openmp = "-fopenmp";
+	} elsif ($compiler =~ /pgf/) {
+	    $vendor = PGI;
+	    $openmp = "-mp";
 	} else {
 	    $vendor = G77;
 	    $openmp = "";
 	}
@@ -334,7 +337,8 @@ if ($link ne "") {
 	    && ($flags !~ /kernel32/)
 	    && ($flags !~ /advapi32/)
 	    && ($flags !~ /shell32/)
-	    && ($flags !~ /omp/)
+	    && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/))
+	    && ($flags !~ /[0-9]+/)
 	    && ($flags !~ /^\-l$/)
 		) {
 	    $linker_l .= $flags . " ";
diff --git a/getarch.c b/getarch.c
index e739d2de9..83043bdf2 100644
--- a/getarch.c
+++ b/getarch.c
@@ -90,11 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
+#if defined(AIX)
+#include <unistd.h>
+#endif
+#if defined(__x86_64__) || defined(_M_X64)
 #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
 #else
 #define NO_AVX512
 #endif
+#endif
 /* #define FORCE_P2		*/
 /* #define FORCE_KATMAI	*/
 /* #define FORCE_COPPERMINE	*/
@@ -360,6 +365,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
+#ifdef FORCE_COOPERLAKE
+#ifdef NO_AVX512
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "HASWELL"
+#define ARCHCONFIG   "-DHASWELL " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+		     "-DFMA3"
+#define LIBNAME   "haswell"
+#define CORENAME  "HASWELL"
+#else
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "COOPERLAKE"
+#define ARCHCONFIG   "-DCOOPERLAKE " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+		     "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
+#define LIBNAME   "cooperlake"
+#define CORENAME  "COOPERLAKE"
+#endif
+#endif
+
 #ifdef FORCE_ATOM
 #define FORCE
 #define FORCE_INTEL
@@ -650,6 +685,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER9" #endif +#if defined(FORCE_POWER10) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER10" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER10 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power10" +#define CORENAME "POWER10" +#endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" @@ -1156,6 +1204,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_THUNDERX3T110 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "THUNDERX3T110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTHUNDERX3T110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "thunderx3t110" +#define CORENAME "THUNDERX3T110" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" @@ -1284,6 +1350,11 @@ static int get_num_cores(void) { sysctl(m, 2, &count, &len, NULL, 0); return count; + +#elif defined(AIX) + //returns the number of processors which are currently online + return sysconf(_SC_NPROCESSORS_ONLN); + #else return 2; #endif @@ -1362,10 +1433,12 @@ int main(int argc, char *argv[]){ #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); -#endif -#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 +#elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); #endif +#if defined(_CALL_ELF) && (_CALL_ELF == 2) +printf("ELF_VERSION=2\n"); +#endif #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5ea39f864..7a8fc6698 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) if (USE_GEMM3M) - GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) + GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type}) endif() endif () if (${float_type} STREQUAL "COMPLEX") diff --git a/interface/Makefile b/interface/Makefile index 741f6bac0..2dbd60073 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,7 +46,9 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) SHBLAS3OBJS = shgemm.$(SUFFIX) +endif DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ @@ -278,7 +280,9 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +endif CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -363,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) endif -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) override CFLAGS += -I. 
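The AIX branch added to get_num_cores() above queries the number of processors currently online through POSIX sysconf() instead of parsing utility output. A standalone equivalent, assuming a POSIX system; this is a sketch, and the fallback mirrors getarch.c's default of 2 when the count cannot be determined:

    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
    #ifdef _SC_NPROCESSORS_ONLN
        /* processors currently online -- the same query the AIX branch uses */
        long n = sysconf(_SC_NPROCESSORS_ONLN);
        if (n > 0) {
            printf("%ld cores online\n", n);
            return 0;
        }
    #endif
        printf("2 cores (default)\n");  /* getarch.c returns 2 when it cannot tell */
        return 0;
    }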
@@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemm.c b/interface/gemm.c index 77a7c8547..68ad8945b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -324,8 +324,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { - sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } diff --git a/interface/swap.c b/interface/swap.c index 17a9868a9..ea40b1fc2 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/interface/zswap.c b/interface/zswap.c index 372b15447..43971b73e 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 4113a1647..84dd949a4 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -127,17 +127,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () + set(USE_DIRECT_SGEMM false) + if (X86_64) + set(USE_DIRECT_SGEMM true) + endif() + + if (USE_DIRECT_SGEMM) + # if (NOT DEFINED SGEMMDIRECTKERNEL) + set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) + set (SGEMMDIRECTPERFORMANT 
sgemm_direct_performant.c) + # endif() + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) + endif() + foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (NOT ${BUILD_HALF}) + continue () + else () + set (float_char "SH") + endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() diff --git a/kernel/Makefile b/kernel/Makefile index 9b468a6af..16211218f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,8 +8,14 @@ include $(TOPDIR)/Makefile.system ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) endif +ifeq ($(ARCH), power) +ifeq ($(C_COMPILER), CLANG) + override CFLAGS += -fno-integrated-as +endif +endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 @@ -32,7 +38,22 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), SKYLAKEX) +ifeq ($(TARGET_CORE), COOPERLAKE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10), 1) + override CFLAGS += -march=cooperlake + else + override CFLAGS += -march=skylake-avx512 + endif + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 91d3e43a6..c58787b24 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64) USE_GEMM3M = 1 endif +ifeq ($(ARCH), x86_64) +USE_DIRECT_SGEMM = 1 +endif + ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif @@ -39,18 +43,28 @@ ifeq ($(CORE), SKYLAKEX) USE_TRMM = 1 endif +ifeq ($(CORE), COOPERLAKE) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif ifeq ($(CORE), POWER8) +ifeq ($(BINARY64),1) USE_TRMM = 1 endif +endif ifeq ($(CORE), POWER9) USE_TRMM = 1 endif +ifeq ($(CORE), POWER10) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif @@ -59,7 +73,15 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -#ifndef SHGEMMKERNEL +ifdef USE_DIRECT_SGEMM +ifndef SGEMMDIRECTKERNEL +SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c +endif +endif + +ifeq ($(BUILD_HALF), 1) +ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c SHGEMMKERNEL = ../generic/gemmkernel_2x2.c SHGEMMINCOPY = ../generic/gemm_ncopy_2.c @@ -70,18 +92,25 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) -#endif +endif SHKERNELOBJS += \ shgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) +endif SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) +ifdef USE_DIRECT_SGEMM +SKERNELOBJS += \ + 
sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif + DKERNELOBJS += \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ @@ -110,7 +139,9 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += $(SHKERNELOBJS) +endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -118,7 +149,10 @@ CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -461,11 +495,13 @@ ZBLASOBJS += \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) - +ifeq ($(BUILD_HALF), 1) SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif + SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -491,8 +527,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -512,12 +550,16 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) + $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) + ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ rm shgemmotcopy.s shgemmotcopy_nomacros.s @@ -532,7 +574,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ rm shgemmitcopy.s shgemmitcopy_nomacros.s @@ -540,6 +582,7 @@ else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif endif $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) @@ -547,7 +590,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s 
sgemmotcopy_nomacros.s @@ -563,7 +606,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s @@ -575,7 +618,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s @@ -593,7 +636,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s @@ -636,7 +679,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s @@ -659,7 +702,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s @@ -691,7 +734,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s @@ -699,19 +742,29 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifdef USE_DIRECT_SGEMM +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifeq ($(BUILD_HALF), 1) + $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif 
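The recurring -E to -S change in the AIX branches above and below fixes the hand-rolled preprocess/m4/assemble pipeline: -E stops after preprocessing, so for the kernels written in C the generated .s file still contained C source and the final assembly step could not consume it, while -S stops after compilation proper and emits genuine assembly. Writing it to stdout with -o - and redirecting presumably keeps the output name under the Makefile's control regardless of how the compiler would name its own -S output. A toy illustration (not part of the patch):

    /* toy_scale.c - a stand-in for a generic C kernel such as
     * ../generic/gemm_ncopy_2.c */
    static double scale = 2.0;
    double toy_scale(double x) { return scale * x; }
    /*
     * old: cc -E toy_scale.c -o toy_scale.s        # ".s" file still holds C source
     * new: cc -S toy_scale.c -o - > toy_scale.s    # genuine assembly on stdout
     *      m4  toy_scale.s > toy_scale_nomacros.s  # expand m4 macros in the asm
     *      cc -c toy_scale_nomacros.s -o toy_scale.o   # now assembles cleanly
     */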
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s @@ -724,7 +777,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s @@ -734,7 +787,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s @@ -744,7 +797,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -754,7 +807,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s @@ -764,7 +817,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s @@ -774,7 +827,7 @@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s @@ -784,7 +837,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC 
zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s @@ -794,7 +847,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s @@ -818,7 +871,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s @@ -828,7 +881,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s @@ -838,7 +891,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s @@ -848,7 +901,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -858,7 +911,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s @@ -868,7 +921,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA 
dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s @@ -878,7 +931,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s @@ -888,7 +941,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s @@ -910,7 +963,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s @@ -920,7 +973,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s @@ -930,7 +983,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s @@ -940,7 +993,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s @@ -950,7 +1003,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + $(CC) 
$(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s @@ -960,7 +1013,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s @@ -970,7 +1023,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s @@ -980,7 +1033,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s @@ -990,7 +1043,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s @@ -1000,7 +1053,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s @@ -1010,7 +1063,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s 
ztrmm_kernel_lr_nomacros.s @@ -1020,7 +1073,7 @@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s @@ -1030,7 +1083,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s @@ -1040,7 +1093,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s @@ -1050,7 +1103,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s @@ -1060,7 +1113,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s @@ -1080,7 +1133,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -1214,7 +1267,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E 
-DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s @@ -2325,8 +2378,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2343,6 +2398,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ @@ -2357,6 +2414,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif + $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2373,7 +2432,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(D cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -2527,7 +2589,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 733c235c6..ba0e57eb5 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; - +#if !defined(__PPC__) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; - +#else + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); +#endif if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - CREAL(result) = dot[0]; +#if !defined(__POWER__) + CREAL(result) = dot[0]; CIMAG(result) = dot[1]; +#else + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); +#endif return(result); } diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index c1d33fa3e..eba38a92e 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -1,3 +1,187 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = 
../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c 
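On the kernel/arm/zdot.c hunk further above: the usual portability issue there is that when OPENBLAS_COMPLEX_FLOAT is the C99 float _Complex type, CREAL()/CIMAG() are not assignable lvalues on every toolchain, so on PowerPC the result is now built in a single expression with OPENBLAS_MAKE_COMPLEX_FLOAT. Note also that the first hunk guards on __PPC__ while the second guards on __POWER__; that asymmetry looks unintentional and is worth double-checking. A minimal sketch of the one-expression construction (an illustration, not the library macro itself):

    /* zinit_sketch.c - building a complex value at once, the way the
     * OPENBLAS_MAKE_COMPLEX_FLOAT path does, instead of assigning the
     * real and imaginary parts separately */
    #include <complex.h>

    static float _Complex make_cresult(float re, float im) {
        return re + im * I;   /* creal()/cimag() are not lvalues in ISO C */
    }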
+endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 new file mode 100644 index 000000000..a20d0d4a6 --- /dev/null +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -0,0 +1,184 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S 
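These per-core kernel files route each BLAS routine either to a tuned assembly implementation or to a generic C fallback, and the GEMM entries additionally pin the packing (copy) routines to the kernel's unroll factors. For readers of the sgemm_kernel_8x8_cortexa53.S source that follows, this is the register-blocking idea it implements, shown as plain C (a conceptual sketch only; the real kernel keeps an 8x8 float tile of C in NEON registers v16-v31, per its own comment block, and here a 4x4 tile is used for brevity):

    /* micro_kernel_4x4.c - scalar model of a register-blocked GEMM
     * micro-kernel computing C += alpha * A*B on one output tile */
    static void micro_kernel_4x4(long K, const float *A, const float *B,
                                 float *C, long ldc, float alpha) {
        float acc[4][4] = {{0.0f}};          /* the "register" tile */
        for (long k = 0; k < K; k++) {
            /* A is packed 4 rows wide, B is packed 4 columns wide */
            for (int i = 0; i < 4; i++)
                for (int j = 0; j < 4; j++)
                    acc[i][j] += A[4*k + i] * B[4*k + j];
        }
        for (int i = 0; i < 4; i++)          /* single write-back pass,  */
            for (int j = 0; j < 4; j++)      /* as in the SAVE macros    */
                C[i + (long)j*ldc] += alpha * acc[i][j];
    }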
+DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = 
dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index b8d0af5c2..baf39150f 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, #128 .endm +/* + * No need to do software prefetches if the vector fits + * into L1 cache + */ +.macro KERNEL_F16_L1CACHE + ldp q4, q5, [X] + ldp q16, q17, [Y] + + ldp q6, q7, [X, #32] + ldp q18, q19, [Y, #32] + + fmla v16.2d, v4.2d, v0.d[0] + fmla v17.2d, v5.2d, v0.d[0] + + stp q16, q17, [Y] + + ldp q20, q21, [X, #64] + ldp q24, q25, [Y, #64] + + fmla v18.2d, v6.2d, v0.d[0] + fmla v19.2d, v7.2d, v0.d[0] + + stp q18, q19, [Y, #32] + + ldp q22, q23, [X, #96] + ldp q26, q27, [Y, #96] + + fmla v24.2d, v20.2d, v0.d[0] + fmla v25.2d, v21.2d, v0.d[0] + + stp q24, q25, [Y, #64] + + fmla v26.2d, v22.2d, v0.d[0] + fmla v27.2d, v23.2d, v0.d[0] + + stp q26, q27, [Y, #96] + + add Y, Y, #128 + add X, X, #128 +.endm + .macro KERNEL_F32 KERNEL_F16 KERNEL_F16 .endm + +.macro KERNEL_F32_L1CACHE + KERNEL_F16_L1CACHE + KERNEL_F16_L1CACHE +.endm + .macro INIT_S lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp I, xzr beq .Ldaxpy_kernel_F1 + cmp N, #2048 + ble .Ldaxpy_kernel_F32_L1CACHE + .align 5 .Ldaxpy_kernel_F32: @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subs I, I, #1 bne .Ldaxpy_kernel_F32 + b .Ldaxpy_kernel_F1 + + .align 5 +.Ldaxpy_kernel_F32_L1CACHE: + + KERNEL_F32_L1CACHE + + subs I, I, #1 + bne .Ldaxpy_kernel_F32_L1CACHE .Ldaxpy_kernel_F1: diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..628a928ca --- /dev/null +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -0,0 +1,2299 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 640 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 96 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + 
fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, 
[pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q2, q3, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + stp q2, q3, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q4, q5, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + stp q6, q7, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q0, q1, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + stp q0, q1, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q2, q3, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + stp q2, q3, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q4, q5, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q6, q7, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + stp q6, q7, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x22, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + ldr x22, [pA], #8 + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + ldr x20, [pA], #8 + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, 
v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr q2, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + str q2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q4, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + str q4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr q6, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + str q6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q0, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + str q0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr q2, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + str q2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr q4, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + str q4, [pCRow2] + + ldr q6, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + str q6, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ldr d0, [pA], #8 + ldp q4, q5, [pB], #32 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr d2, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + str d2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d4, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + str d4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr d6, [pCRow1] + fmla v6.2s, v22.2s, alphaV2 + str d6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr d2, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + str d2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr d4, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + str d4, [pCRow2] + + ldr d6, [pCRow1] + fmla v6.2s, v30.2s, alphaV2 + str d6, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ldp q4, q5, [pB], #32 + ldr s0, [pA], #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ldr s0, [pCRow0] + fmla s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s6, [pCRow1] + fmla s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s0, [pCRow2] + fmla s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s28, alphaV0 
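+	// rows seven and eight of the 1x8 column: scale by alpha and store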
+ str s4, [pCRow2] + + ldr s6, [pCRow1] + fmla s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL8x4_I + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmul v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] +.endm + +.macro KERNEL8x4_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr x22, [pA], #16 + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #8 + fmla v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr x20, [pA], #16 + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v6.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] +.endm + +.macro KERNEL8x4_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v6.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] +.endm + +.macro KERNEL8x4_SUB + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] +.endm + +.macro SAVE8x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add pCRow1, pCRow2, LDC + + ldp q0, q1, [pCRow2] + fmla v0.4s, v20.4s, alphaV0 + fmla v1.4s, v21.4s, alphaV1 + stp q0, q1, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] + fmla v4.4s, v22.4s, alphaV0 + fmla v5.4s, v23.4s, alphaV1 + stp q4, q5, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, wzr +.endm + +.macro KERNEL4x4_I + ldr q0, [pA], #16 + ldr q4, [pB], #16 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr 
x22, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] +.endm + +.macro KERNEL4x4_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr x22, [pA], #8 + ldr x26, [pB], #8 + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] +.endm + +.macro KERNEL4x4_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr x20, [pA], #8 + ldr x24, [pB], #8 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] +.endm + +.macro KERNEL4x4_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] +.endm + +.macro KERNEL4x4_SUB + ldr q0, [pA], #16 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] +.endm + +.macro SAVE4x4 + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] + + add pCRow1, pCRow0, LDC + ldr q1, [pCRow1] + fmla v1.4s, v18.4s, alphaV2 + str q1, [pCRow1] + + add pCRow2, pCRow1, LDC + ldr q2, [pCRow2] + fmla v2.4s, v20.4s, alphaV0 + str q2, [pCRow2] + + add pCRow1, pCRow2, LDC + ldr q3, [pCRow1] + fmla v3.4s, v22.4s, alphaV2 + str q3, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 +.endm + +.macro KERNEL2x4_SUB + ldr d0, [pA], #8 + ldr q4, [pB], #16 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] +.endm + +.macro SAVE2x4 + ldr d8, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + str d8, [pCRow0] + + add pCRow1, pCRow0, LDC + ldr d12, [pCRow1] + fmla v12.2s, v18.2s, alphaV1 + str d12, [pCRow1] + + add pCRow2, pCRow1, LDC + ldr d8, [pCRow2] + fmla v8.2s, v20.2s, alphaV2 + str d8, [pCRow2] + + add pCRow1, pCRow2, LDC + ldr d12, [pCRow1] + fmla v12.2s, v22.2s, alphaV3 + str d12, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 +.endm + +.macro KERNEL8x2_SUB + ldp q0, q1, [pA], #32 + ldr d4, [pB], #8 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] +.endm + +.macro SAVE8x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add pCRow1, pCRow0, LDC + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + 
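+	// second row of the 8x2 tile: C += alpha * (v18, v19)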
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add pCRow2, pCRow1, LDC + + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s4, [pB], #4 + ldp q0, q1, [pA], #32 + + fmla v16.4s, v0.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v1.4s, v4.s[0] +.endm + +.macro SAVE8x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + stp q0, q1, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + 
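+	// scalar tail: accumulate one element of A against one element of B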
ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lsgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L8_M4_BEGIN + +.Lsgemm_kernel_L8_M8_20: + + mov pB, origPB + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? + blt .Lsgemm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M8_22a + .align 5 + +.Lsgemm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M8_22 + +.Lsgemm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_40: + + INIT8x8 + +.Lsgemm_kernel_L8_M8_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_M8_100 + +.Lsgemm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lsgemm_kernel_L8_M8_46 + +.Lsgemm_kernel_L8_M8_100: + + SAVE8x8 + +.Lsgemm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L8_END + + tst counterI, #4 + ble .Lsgemm_kernel_L8_M2_BEGIN + +.Lsgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
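+	// fewer than 4 K iterations: skip the software-pipelined loop and take the short path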
+ blt .Lsgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M4_22a + .align 5 + +.Lsgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M4_22 + +.Lsgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_40: + + INIT4x8 + +.Lsgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L8_M4_100 + +.Lsgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lsgemm_kernel_L8_M4_100: + + SAVE4x8 + +.Lsgemm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L8_M1_BEGIN + +.Lsgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M2_40 + +.Lsgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_22 + + +.Lsgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M2_100 + +.Lsgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_42 + +.Lsgemm_kernel_L8_M2_100: + + SAVE2x8 + +.Lsgemm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L8_END + +.Lsgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M1_40 + +.Lsgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_22 + + +.Lsgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M1_100 + +.Lsgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_42 + +.Lsgemm_kernel_L8_M1_100: + + SAVE1x8 + +.Lsgemm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lsgemm_kernel_L999 + + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L4_M4_BEGIN + +.Lsgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
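+	// K < 4: not enough depth to prime the pipelined loop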
+ blt .Lsgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M8_22a + .align 5 + +.Lsgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M8_22 + +.Lsgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_40: + + INIT8x4 + +.Lsgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M8_100 + +.Lsgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lsgemm_kernel_L4_M8_100: + + SAVE8x4 + +.Lsgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L4_END + + tst counterI, #4 + ble .Lsgemm_kernel_L4_M2_BEGIN + +.Lsgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt .Lsgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M4_22a + .align 5 + +.Lsgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M4_22 + +.Lsgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lsgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M4_100 + +.Lsgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lsgemm_kernel_L4_M4_100: + + SAVE4x4 + +.Lsgemm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L4_M1_BEGIN + +.Lsgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M2_40 + +.Lsgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_22 + + +.Lsgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M2_100 + +.Lsgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_42 + +.Lsgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lsgemm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L4_END + +.Lsgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M1_40 + +.Lsgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_22 + + +.Lsgemm_kernel_L4_M1_40: + + ands 
counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M1_100 + +.Lsgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_42 + +.Lsgemm_kernel_L4_M1_100: + + SAVE1x4 + +.Lsgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L2_BEGIN: // less than 4 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lsgemm_kernel_L999 + + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lsgemm_kernel_L2_M4_BEGIN + +.Lsgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M8_40 + .align 5 + +.Lsgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_22 + + +.Lsgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M8_100 + +.Lsgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_42 + +.Lsgemm_kernel_L2_M8_100: + + SAVE8x2 + +.Lsgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L2_END + + tst counterI, #4 + ble .Lsgemm_kernel_L2_M2_BEGIN + +.Lsgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M4_40 + .align 5 + +.Lsgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_22 + + +.Lsgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M4_100 + +.Lsgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_42 + +.Lsgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lsgemm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L2_M1_BEGIN + +.Lsgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M2_40 + +.Lsgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M2_22 + + +.Lsgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M2_100 + +.Lsgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt
.Lsgemm_kernel_L2_M2_42 + +.Lsgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lsgemm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L2_END + +.Lsgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lsgemm_kernel_L2_M1_40 + +.Lsgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_22 + + +.Lsgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M1_100 + +.Lsgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_42 + +.Lsgemm_kernel_L2_M1_100: + + SAVE1x2 + +.Lsgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lsgemm_kernel_L1_M4_BEGIN + +.Lsgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M8_40 + .align 5 + +.Lsgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_22 + + +.Lsgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M8_100 + +.Lsgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_42 + +.Lsgemm_kernel_L1_M8_100: + + SAVE8x1 + +.Lsgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L1_END + + tst counterI, #4 + ble .Lsgemm_kernel_L1_M2_BEGIN + +.Lsgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M4_40 + .align 5 + +.Lsgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_22 + + +.Lsgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M4_100 + +.Lsgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_42 + +.Lsgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lsgemm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + 
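+	// bit 1 of M clear: no row pair left, fall through to the single-row tail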
ble .Lsgemm_kernel_L1_M1_BEGIN + +.Lsgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M2_40 + +.Lsgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_22 + + +.Lsgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M2_100 + +.Lsgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_42 + +.Lsgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lsgemm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L1_END + +.Lsgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M1_40 + +.Lsgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_22 + + +.Lsgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M1_100 + +.Lsgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_42 + +.Lsgemm_kernel_L1_M1_100: + + SAVE1x1 + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S new file mode 100644 index 000000000..f99b1d992 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 +#define A05 x9 +#define A06 x10 +#define A07 x11 +#define A08 x12 + +#define I x13 +#define J x14 +#define K x15 + +#define TEMP1 x16 +#define TEMP2 x17 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x8 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v12.s[0], v0.s[2] + ins v14.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + ins v12.s[1], v1.s[2] + ins v14.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v12.s[2], v2.s[2] + ins v14.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + ins v12.s[3], v3.s[2] + ins v14.s[3], v3.s[3] + + ldr q4, [A05], #16 + ldr q5, [A06], #16 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v13.s[0], v4.s[2] + ins v15.s[0], v4.s[3] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + ins v13.s[1], v5.s[2] + ins v15.s[1], v5.s[3] + + ldr q6, [A07], #16 + ldr q7, [A08], #16 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v13.s[2], v6.s[2] + ins v15.s[2], v6.s[3] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + ins v13.s[3], v7.s[2] + ins v15.s[3], v7.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 +.endm + +.macro COPY2x8 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + + ldr d4, [A05], #8 + ldr d5, [A06], #8 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + + ldr d6, [A07], #8 + ldr d7, [A08], #8 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v9.s[3], v7.s[0] + ins v11.s[3], 
v7.s[1] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY1x8 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + ldr s4, [A05], #4 + ldr s5, [A06], #4 + ins v9.s[0], v4.s[0] + ins v9.s[1], v5.s[0] + + ldr s6, [A07], #4 + ldr s7, [A08], #4 + ins v9.s[2], v6.s[0] + ins v9.s[3], v7.s[0] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY4x4 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY2x4 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY1x4 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + st1 {v8.4s}, [B00], #16 +.endm + +.macro COPY4x2 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 +.endm + +.macro COPY2x2 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + st1 {v8.2s, v9.2s}, [B00], #16 +.endm + +.macro COPY1x2 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + st1 {v8.2s}, [B00], #8 +.endm + +.macro COPY1x1 + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Lsgemm_ncopy_L8_BEGIN: + + asr J, N, #3 // J = N / 8 + cmp J, #0 + ble .Lsgemm_ncopy_L4_BEGIN + + .align 5 +.Lsgemm_ncopy_L8_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A00, A08, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_40 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A01 + + .align 5 +.Lsgemm_tcopy_L8_warnup_1: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_1 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A02 + + .align 5 +.Lsgemm_tcopy_L8_warnup_2: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_2 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A03 + + .align 5 +.Lsgemm_tcopy_L8_warnup_3: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_3 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A04 + + .align 5 
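+	// the same one-load-per-cache-line warm-up walks the remaining source columns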
+.Lsgemm_tcopy_L8_warnup_4: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_4 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A05 + + .align 5 +.Lsgemm_tcopy_L8_warnup_5: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_5 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A06 + + .align 5 +.Lsgemm_tcopy_L8_warnup_6: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_6 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A07 + + .align 5 +.Lsgemm_tcopy_L8_warnup_7: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_7 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A08 + + .align 5 +.Lsgemm_tcopy_L8_warnup_8: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_8 + + .align 5 +.Lsgemm_ncopy_L8_M4_20: + + COPY4x8 + + subs I, I, #1 + bne .Lsgemm_ncopy_L8_M4_20 + +.Lsgemm_ncopy_L8_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_60 + + COPY2x8 + +.Lsgemm_ncopy_L8_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_END + + COPY1x8 + +.Lsgemm_ncopy_L8_M4_END: + + subs J , J, #1 // j-- + bne .Lsgemm_ncopy_L8_M4_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L4_BEGIN: + + tst N, #7 + ble .Lsgemm_ncopy_L999 + + tst N, #4 + ble .Lsgemm_ncopy_L2_BEGIN + +.Lsgemm_ncopy_L4_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_40 + + .align 5 +.Lsgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I, I, #1 + bne .Lsgemm_ncopy_L4_M4_20 + +.Lsgemm_ncopy_L4_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_60 + + COPY2x4 + +.Lsgemm_ncopy_L4_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_END + + COPY1x4 + +.Lsgemm_ncopy_L4_M4_END: + + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L2_BEGIN: + + tst N, #2 + ble .Lsgemm_ncopy_L1_BEGIN + +.Lsgemm_ncopy_L2_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_40 + + .align 5 +.Lsgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Lsgemm_ncopy_L2_M4_20 + + +.Lsgemm_ncopy_L2_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_60 + + COPY2x2 + +.Lsgemm_ncopy_L2_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_END + + COPY1x2 + +.Lsgemm_ncopy_L2_M4_END: + +.Lsgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Lsgemm_ncopy_L999 + +.Lsgemm_ncopy_L1_M1_BEGIN: + + mov A01, A00 + + mov I, M + cmp I, #0 + ble .Lsgemm_ncopy_L1_M1_END + + .align 5 +.Lsgemm_ncopy_L1_M1_20: + + COPY1x1 + + subs I, I, #1 + bne .Lsgemm_ncopy_L1_M1_20 + +.Lsgemm_ncopy_L1_M1_END: + +.Lsgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_tcopy_8.S b/kernel/arm64/sgemm_tcopy_8.S new file mode 100644 index 000000000..7d81ba266 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_8.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x18 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] +
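+	// the last two source rows (A07, A08) follow at the next 64-byte offset of the block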
add TEMP1, TEMP1, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + add B02, B02, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B02] + add B02, B02, #16 + stp d6, d7, [B02] + add B02, B02, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + // single column: step each source pointer by one float, as for A01-A04 + add A05, A05, #4 + add A06, A06, #4 + add A07, A07, #4 + add A08, A08, #4 + + stp s4, s5, [B03] + add B03, B03, #8 + stp s6, s7, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + + add B01, B01,
#64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + + add B02, B02, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B01] + add B01, B01, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B02] + add B02, B02, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B03] + + add B03, B03, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B00] + + add B00, B00, M8 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B01] + + add B01, B01, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B02] + + add B02, B02, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B03] + + add B03, B03, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-8 + and B02 , N , #-4 + and B03 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + + lsl M8, M, #5 // M8 = M * 8 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #256 // B = B + 8 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble 
.Lsgemm_tcopy_L8_M8_40 + + .align 5 +.Lsgemm_tcopy_L8_M8_20: + + COPY8x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M8_20 + +.Lsgemm_tcopy_L8_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L8_M8_60 + + COPY4x8 + +.Lsgemm_tcopy_L8_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M8_80 + + COPY2x8 + +.Lsgemm_tcopy_L8_M8_80: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M8_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M8_END: + + subs J, J, #1 // j-- + bne .Lsgemm_tcopy_L8_M8_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #128 // B = B + 4 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M8_40 + + .align 5 +.Lsgemm_tcopy_L4_M8_20: + + COPY8x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M8_20 + +.Lsgemm_tcopy_L4_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L4_M8_60 + + COPY4x4 + +.Lsgemm_tcopy_L4_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M8_80 + + COPY2x4 + +.Lsgemm_tcopy_L4_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L4_M8_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #64 // B = B + 2 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M8_40 + + .align 5 +.Lsgemm_tcopy_L2_M8_20: + + COPY8x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M8_20 + +.Lsgemm_tcopy_L2_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L2_M8_60 + + COPY4x2 + +.Lsgemm_tcopy_L2_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M8_80 + + COPY2x2 + +.Lsgemm_tcopy_L2_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M8_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M8_40 + + .align 5 +.Lsgemm_tcopy_L1_M8_20: + + COPY8x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M8_20 + +.Lsgemm_tcopy_L1_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L1_M8_60 + + COPY4x1 + +.Lsgemm_tcopy_L1_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M8_80 + + COPY2x1 + +.Lsgemm_tcopy_L1_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M8_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M8_END: + +.Lsgemm_tcopy_L999: + + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/strmm_kernel_8x8_cortexa53.S b/kernel/arm64/strmm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..4b84623f3 --- /dev/null +++ b/kernel/arm64/strmm_kernel_8x8_cortexa53.S @@ -0,0 +1,2823 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* 
Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v0.4s, v1.4s}, [pA], #32 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, 
v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v0.4s}, [pA], #16 + ld1 {v4.4s, v5.4s}, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x21, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #16 + fmla v20.4s, v0.4s, v4.s[2] + ldr x27, [pB], #8 + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x21 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + ldr x24, [pB], #16 + fmla v20.4s, v2.4s, v6.s[2] + ldr x25, [pB], #8 + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x21 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, 
v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + + fmul v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + + fmul v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + + fmul v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + + fmul s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s28, alphaV0 + str s4, [pCRow2] + + + fmul s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 
{v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + + ldr d9, [pB], #8 + ldr d2, [pA], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmul v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmul v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v8.s[2] + fmul v25.4s, v1.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] + fmul v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M1 + ldr d9, [pB], #8 + fmov v8.d[1], x24 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + fmla v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmla v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmla v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d8, [pB], #8 + fmov v9.d[1], x25 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v9.s[0] + ldr x20, [pA], #16 + fmla v20.4s, v2.4s, v9.s[1] + ldr x21, [pA], #8 + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_E + fmov v9.d[1], x25 + fmov v2.d[1], x22 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + fmla v17.4s, v3.4s, v9.s[0] + fmla v20.4s, v2.4s, v9.s[1] + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] + + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] + + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] + + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + prfm PLDL1KEEP, [pB, 
#512] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] +.endm + +.macro SAVE2x4 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] +.endm + +.macro SAVE8x2 + add pCRow1, 
pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + 
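+/*******************************************************************************
+* Reference semantics of the INIT/KERNEL/SAVE macros above. This is a hedged,
+* illustrative C sketch, not part of the build; acc, M, N and ldc are
+* placeholder names. Each MxN tile keeps its accumulators in v16..v31, each
+* KERNELMxN_* step performs one rank-1 update from the packed panels pA and
+* pB, and SAVEMxN overwrites C with alpha times the accumulator (TRMM does
+* not read the previous contents of C):
+*
+*   for (k = 0; k < tempK; k++)              // KERNELMxN_SUB, tempK times
+*       for (n = 0; n < N; n++)
+*           for (m = 0; m < M; m++)
+*               acc[n][m] += pA[k*M + m] * pB[k*N + n];
+*   for (n = 0; n < N; n++)                  // SAVEMxN
+*       for (m = 0; m < M; m++)
+*           C[n*ldc + m] = alpha * acc[n][m];
+*
+* The _I/_M1/_M2/_E variants software-pipeline this update: _I issues the
+* first loads and the opening fmul chain, _M1/_M2 alternate between two
+* register sets (v0/v1/v4/v5 and v2/v3/v6/v7) so the loads for the next k
+* step overlap the fmla chain of the current one, and _E drains the pipeline
+* without starting new loads.
+*******************************************************************************/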
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lstrmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L8_M4_BEGIN + +.Lstrmm_kernel_L8_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? 
+ blt .Lstrmm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M8_22a + .align 5 + +.Lstrmm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M8_22 + +.Lstrmm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_40: + + INIT8x8 + +.Lstrmm_kernel_L8_M8_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_M8_100 + +.Lstrmm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lstrmm_kernel_L8_M8_46 + +.Lstrmm_kernel_L8_M8_100: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +.Lstrmm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L8_END + + tst counterI, #4 + ble .Lstrmm_kernel_L8_M2_BEGIN + +.Lstrmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M4_22a + .align 5 + +.Lstrmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M4_22 + +.Lstrmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_40: + + INIT4x8 + +.Lstrmm_kernel_L8_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L8_M4_100 + +.Lstrmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lstrmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lstrmm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L8_M1_BEGIN + +.Lstrmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L8_M2_40 + +.Lstrmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_22 + + +.Lstrmm_kernel_L8_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M2_100 + +.Lstrmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_42 + +.Lstrmm_kernel_L8_M2_100: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L8_END + +.Lstrmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 
+ ble .Lstrmm_kernel_L8_M1_40 + +.Lstrmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_22 + + +.Lstrmm_kernel_L8_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M1_100 + +.Lstrmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_42 + +.Lstrmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +.Lstrmm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lstrmm_kernel_L999 + + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L4_M4_BEGIN + +.Lstrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M8_22a + .align 5 + +.Lstrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M8_22 + +.Lstrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_40: + + INIT8x4 + +.Lstrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M8_100 + +.Lstrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lstrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L4_END + + tst counterI, #4 + ble .Lstrmm_kernel_L4_M2_BEGIN + +.Lstrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lstrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M4_22a + .align 5 + +.Lstrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M4_22 + +.Lstrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_40: + + INIT4x4 + +.Lstrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M4_100 + +.Lstrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lstrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L4_M1_BEGIN + +.Lstrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M2_40 + +.Lstrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_22 + + +.Lstrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M2_100 + +.Lstrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_42 + +.Lstrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L4_END + +.Lstrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M1_40 
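+/* TRMM offset bookkeeping, summarized (a hedged reading of the #if blocks
+   above, not normative): tempK is the effective inner length for the current
+   tile. Depending on LEFT/TRANSA it is either origK - tempOffset or
+   tempOffset plus the tile's unroll factor (e.g. for this 1x4 tile,
+   LEFT && TRANSA gives tempK = tempOffset + 1), so only the non-trivial
+   triangle of the packed operand is traversed. After each SAVE, pA and pB
+   are advanced past the untouched remainder; the lsl #2/#3/#4/#5 shifts
+   scale the 1/2/4/8-wide float counts to bytes. For LEFT, tempOffset then
+   grows by the M unroll so the next row block starts deeper in the
+   triangle. */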
+ +.Lstrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_22 + + +.Lstrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M1_100 + +.Lstrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_42 + +.Lstrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lstrmm_kernel_L999 + + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lstrmm_kernel_L2_M4_BEGIN + +.Lstrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M8_40 + .align 5 + +.Lstrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_22 + + +.Lstrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M8_100 + +.Lstrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_42 + +.Lstrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L2_END + + tst counterI, #4 + ble .Lstrmm_kernel_L2_M2_BEGIN + +.Lstrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) 
&& !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M4_40 + .align 5 + +.Lstrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_22 + + +.Lstrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M4_100 + +.Lstrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_42 + +.Lstrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L2_M1_BEGIN + +.Lstrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M2_40 + +.Lstrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_22 + + +.Lstrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M2_100 + +.Lstrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_42 + +.Lstrmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L2_END + +.Lstrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lstrmm_kernel_L2_M1_40 + +.Lstrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_22 + + +.Lstrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M1_100 + +.Lstrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_42 + +.Lstrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lstrmm_kernel_L1_M4_BEGIN + +.Lstrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M8_40 + .align 5 + +.Lstrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_22 + + +.Lstrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M8_100 + +.Lstrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_42 + +.Lstrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble 
.Lstrmm_kernel_L1_END + + tst counterI, #4 + ble .Lstrmm_kernel_L1_M2_BEGIN + +.Lstrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M4_40 + .align 5 + +.Lstrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_22 + + +.Lstrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M4_100 + +.Lstrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_42 + +.Lstrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L1_M1_BEGIN + +.Lstrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M2_40 + +.Lstrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_22 + + +.Lstrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M2_100 + +.Lstrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_42 + +.Lstrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L1_END + +.Lstrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + 
mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M1_40 + +.Lstrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_22 + + +.Lstrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M1_100 + +.Lstrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_42 + +.Lstrmm_kernel_L1_M1_100: + + SAVE1x1 + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/common_param.h b/kernel/common_param.h deleted file mode 100644 index 29bb65e5c..000000000 --- a/kernel/common_param.h +++ /dev/null @@ -1,1403 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#ifndef COMMON_PARAM_H -#define COMMON_PARAM_H - -#ifndef ASSEMBLER - -#ifdef DYNAMIC_ARCH - -typedef struct { - int dtb_entries; - int offsetA, offsetB, align; - -#if 1 - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; - - float (*shamax_k) (BLASLONG, float *, BLASLONG); - float (*shamin_k) (BLASLONG, float *, BLASLONG); - float (*shmax_k) (BLASLONG, float *, BLASLONG); - float (*shmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); - - float (*shnrm2_k) (BLASLONG, float *, BLASLONG); - float (*shasum_k) (BLASLONG, float *, BLASLONG); - float (*shsum_k) (BLASLONG, float *, BLASLONG); - int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - - int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
float *); - int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - -#endif - int sgemm_p, sgemm_q, sgemm_r; - int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; - - int exclusive_cache; - - 
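
The file being removed here defines the DYNAMIC_ARCH parameter table: one struct whose fields are the GEMM blocking sizes (p, q, r), the unroll factors, and a function pointer for every kernel of every precision (the bfloat16 "sh" entries above included). Below is a minimal sketch of that pattern in C, with hypothetical names rather than the real OpenBLAS symbols; in the real library one such table exists per CPU target and the matching one is selected once at startup.

/* Hypothetical sketch of a DYNAMIC_ARCH-style dispatch table; names and
 * signatures are illustrative, not the actual OpenBLAS API. */
#include <stdio.h>

typedef long blaslong_t;                      /* stand-in for BLASLONG */

typedef struct {
  int sgemm_p, sgemm_q, sgemm_r;              /* cache-blocking sizes */
  int sgemm_unroll_m, sgemm_unroll_n;         /* micro-kernel tile shape */
  float (*sdot_k)(blaslong_t, float *, blaslong_t, float *, blaslong_t);
} dispatch_table_t;

/* Portable fallback kernel; a real build would also carry variants
 * compiled for newer ISAs (SSE/AVX/NEON ...). */
static float sdot_generic(blaslong_t n, float *x, blaslong_t incx,
                          float *y, blaslong_t incy) {
  float s = 0.0f;
  for (blaslong_t i = 0; i < n; i++)
    s += x[i * incx] * y[i * incy];
  return s;
}

static dispatch_table_t table_generic = {256, 256, 4096, 8, 4, sdot_generic};

/* Set once after CPU detection at library init; every entry point
 * afterwards just dereferences it. */
static dispatch_table_t *active_table = &table_generic;

int main(void) {
  float x[] = {1.0f, 2.0f, 3.0f}, y[] = {4.0f, 5.0f, 6.0f};
  printf("sdot = %g\n", active_table->sdot_k(3, x, 1, y, 1)); /* prints 32 */
  return 0;
}

Dispatching through a single table pointer keeps the per-call cost to one indirect call, which is why the blocking parameters and all kernels travel together in one struct rather than each kernel probing the CPU on its own.

-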
float (*samax_k) (BLASLONG, float *, BLASLONG); - float (*samin_k) (BLASLONG, float *, BLASLONG); - float (*smax_k) (BLASLONG, float *, BLASLONG); - float (*smin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - - float (*snrm2_k) (BLASLONG, float *, BLASLONG); - float (*sasum_k) (BLASLONG, float *, BLASLONG); - float (*ssum_k) (BLASLONG, float *, BLASLONG); - int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); - int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - - int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float 
*); - int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int dgemm_p, dgemm_q, dgemm_r; - int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; - - double (*damax_k) (BLASLONG, double *, BLASLONG); - double (*damin_k) (BLASLONG, double *, BLASLONG); - double (*dmax_k) (BLASLONG, double *, BLASLONG); - double (*dmin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); - - double (*dnrm2_k) (BLASLONG, double *, BLASLONG); - double (*dasum_k) 
(BLASLONG, double *, BLASLONG); - double (*dsum_k) (BLASLONG, double *, BLASLONG); - int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); - int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int 
(*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int qgemm_p, qgemm_q, qgemm_r; - int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; - - xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); - - xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int 
(*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int 
(*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - int cgemm_p, cgemm_q, cgemm_r; - int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - - float (*camax_k) (BLASLONG, float *, BLASLONG); - float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); - - float (*cnrm2_k) (BLASLONG, float *, BLASLONG); - float (*casum_k) (BLASLONG, float *, BLASLONG); - float (*csum_k) (BLASLONG, float *, BLASLONG); - int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, 
BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LR)(BLASLONG, 
BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, 
BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int cgemm3m_p, cgemm3m_q, cgemm3m_r; - int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; - - int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - - int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - - int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float, float, float *); - int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int zgemm_p, zgemm_q, zgemm_r; - int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; - - double (*zamax_k) (BLASLONG, double *, BLASLONG); - double (*zamin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); - - double (*znrm2_k) (BLASLONG, double *, BLASLONG); - double (*zasum_k) (BLASLONG, double *, BLASLONG); - double (*zsum_k) (BLASLONG, double *, BLASLONG); - int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int 
(*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, 
BLASLONG, BLASLONG); - int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double 
*); - int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int zgemm3m_p, zgemm3m_q, zgemm3m_r; - int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; - - int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - - int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - - int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, 
BLASLONG, double, double, double *); - int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int xgemm_p, xgemm_q, xgemm_r; - int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; - - xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); - - xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, 
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RT)(BLASLONG, 
BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int 
(*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int xgemm3m_p, xgemm3m_q, xgemm3m_r; - int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; - - int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - - int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyb)(BLASLONG, 
BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - - void (*init)(void); - - int snum_opt, dnum_opt, qnum_opt; - - int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); - int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); - int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); - int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); - - int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - - int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - - int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - 
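For orientation in this long dispatch table: the omatcopy and imatcopy entries encode their variant in the suffix. "o" versus "i" means out-of-place versus in-place, "c" or "r" selects column- or row-major storage, "n" or "t" selects no-transpose versus transpose, and a trailing "c" marks the conjugate variants that follow directly below. A minimal sketch of how a DYNAMIC_ARCH build would route one such call through this table; the wrapper name is hypothetical, and it assumes gotoblas (declared a few lines further down) already points at the detected CPU's table:

    /* Hypothetical illustration: dispatch a single-precision out-of-place
       copy, column-major and non-transposed, through the runtime-selected
       gotoblas table. */
    static int somatcopy_cn_dispatch(BLASLONG rows, BLASLONG cols, float alpha,
                                     float *a, BLASLONG lda,
                                     float *b, BLASLONG ldb) {
      return gotoblas->somatcopy_k_cn(rows, cols, alpha, a, lda, b, ldb);
    }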
int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); - - int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); - - int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); - int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); - int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); - int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - -} gotoblas_t; - -extern gotoblas_t *gotoblas; - -#define DTB_ENTRIES gotoblas -> 
dtb_entries -#define GEMM_OFFSET_A gotoblas -> offsetA -#define GEMM_OFFSET_B gotoblas -> offsetB -#define GEMM_ALIGN gotoblas -> align - -#define HAVE_EX_L2 gotoblas -> exclusive_cache - -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn - -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn - -#define DGEMM_P gotoblas -> dgemm_p -#define DGEMM_Q gotoblas -> dgemm_q -#define DGEMM_R gotoblas -> dgemm_r -#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m -#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n -#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn - -#define QGEMM_P gotoblas -> qgemm_p -#define QGEMM_Q gotoblas -> qgemm_q -#define QGEMM_R gotoblas -> qgemm_r -#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m -#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n -#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn - -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn - -#define ZGEMM_P gotoblas -> zgemm_p -#define ZGEMM_Q gotoblas -> zgemm_q -#define ZGEMM_R gotoblas -> zgemm_r -#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m -#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n -#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn - -#define XGEMM_P gotoblas -> xgemm_p -#define XGEMM_Q gotoblas -> xgemm_q -#define XGEMM_R gotoblas -> xgemm_r -#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m -#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n -#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn - -#define CGEMM3M_P gotoblas -> cgemm3m_p -#define CGEMM3M_Q gotoblas -> cgemm3m_q -#define CGEMM3M_R gotoblas -> cgemm3m_r -#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m -#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n -#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn - -#define ZGEMM3M_P gotoblas -> zgemm3m_p -#define ZGEMM3M_Q gotoblas -> zgemm3m_q -#define ZGEMM3M_R gotoblas -> zgemm3m_r -#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m -#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n -#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn - -#define XGEMM3M_P gotoblas -> xgemm3m_p -#define XGEMM3M_Q gotoblas -> xgemm3m_q -#define XGEMM3M_R gotoblas -> xgemm3m_r -#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m -#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n -#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn - -#else - -#define DTB_ENTRIES DTB_DEFAULT_ENTRIES - -#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A -#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B -#define GEMM_ALIGN GEMM_DEFAULT_ALIGN - -#ifdef HAVE_EXCLUSIVE_CACHE -#define HAVE_EX_L2 1 -#else -#define HAVE_EX_L2 0 -#endif - -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN -#else -#define SHGEMM_UNROLL_MN 
MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) -#endif - -#define SGEMM_P SGEMM_DEFAULT_P -#define SGEMM_Q SGEMM_DEFAULT_Q -#define SGEMM_R SGEMM_DEFAULT_R -#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#ifdef SGEMM_DEFAULT_UNROLL_MN -#define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN -#else -#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) -#endif - -#define DGEMM_P DGEMM_DEFAULT_P -#define DGEMM_Q DGEMM_DEFAULT_Q -#define DGEMM_R DGEMM_DEFAULT_R -#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#ifdef DGEMM_DEFAULT_UNROLL_MN -#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN -#else -#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) -#endif - -#define QGEMM_P QGEMM_DEFAULT_P -#define QGEMM_Q QGEMM_DEFAULT_Q -#define QGEMM_R QGEMM_DEFAULT_R -#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - -#define CGEMM_P CGEMM_DEFAULT_P -#define CGEMM_Q CGEMM_DEFAULT_Q -#define CGEMM_R CGEMM_DEFAULT_R -#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#ifdef CGEMM_DEFAULT_UNROLL_MN -#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN -#else -#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) -#endif - -#define ZGEMM_P ZGEMM_DEFAULT_P -#define ZGEMM_Q ZGEMM_DEFAULT_Q -#define ZGEMM_R ZGEMM_DEFAULT_R -#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#ifdef ZGEMM_DEFAULT_UNROLL_MN -#define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN -#else -#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) -#endif - -#define XGEMM_P XGEMM_DEFAULT_P -#define XGEMM_Q XGEMM_DEFAULT_Q -#define XGEMM_R XGEMM_DEFAULT_R -#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) - -#ifdef CGEMM3M_DEFAULT_UNROLL_N - -#define CGEMM3M_P CGEMM3M_DEFAULT_P -#define CGEMM3M_Q CGEMM3M_DEFAULT_Q -#define CGEMM3M_R CGEMM3M_DEFAULT_R -#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) - -#else - -#define CGEMM3M_P SGEMM_DEFAULT_P -#define CGEMM3M_Q SGEMM_DEFAULT_Q -#define CGEMM3M_R SGEMM_DEFAULT_R -#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) - -#endif - - -#ifdef ZGEMM3M_DEFAULT_UNROLL_N - -#define ZGEMM3M_P ZGEMM3M_DEFAULT_P -#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q -#define ZGEMM3M_R ZGEMM3M_DEFAULT_R -#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#else - -#define ZGEMM3M_P DGEMM_DEFAULT_P -#define ZGEMM3M_Q DGEMM_DEFAULT_Q -#define ZGEMM3M_R DGEMM_DEFAULT_R -#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#endif - -#define XGEMM3M_P QGEMM_DEFAULT_P -#define XGEMM3M_Q QGEMM_DEFAULT_Q -#define XGEMM3M_R QGEMM_DEFAULT_R -#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - - -#endif -#endif - -#ifndef COMPLEX -#if defined(XDOUBLE) 
-#define GEMM_P QGEMM_P -#define GEMM_Q QGEMM_Q -#define GEMM_R QGEMM_R -#define GEMM_UNROLL_M QGEMM_UNROLL_M -#define GEMM_UNROLL_N QGEMM_UNROLL_N -#define GEMM_UNROLL_MN QGEMM_UNROLL_MN -#define GEMM_DEFAULT_P QGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R QGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P DGEMM_P -#define GEMM_Q DGEMM_Q -#define GEMM_R DGEMM_R -#define GEMM_UNROLL_M DGEMM_UNROLL_M -#define GEMM_UNROLL_N DGEMM_UNROLL_N -#define GEMM_UNROLL_MN DGEMM_UNROLL_MN -#define GEMM_DEFAULT_P DGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R DGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P SGEMM_P -#define GEMM_Q SGEMM_Q -#define GEMM_R SGEMM_R -#define GEMM_UNROLL_M SGEMM_UNROLL_M -#define GEMM_UNROLL_N SGEMM_UNROLL_N -#define GEMM_UNROLL_MN SGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#endif -#else -#if defined(XDOUBLE) -#define GEMM_P XGEMM_P -#define GEMM_Q XGEMM_Q -#define GEMM_R XGEMM_R -#define GEMM_UNROLL_M XGEMM_UNROLL_M -#define GEMM_UNROLL_N XGEMM_UNROLL_N -#define GEMM_UNROLL_MN XGEMM_UNROLL_MN -#define GEMM_DEFAULT_P XGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R XGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P ZGEMM_P -#define GEMM_Q ZGEMM_Q -#define GEMM_R ZGEMM_R -#define GEMM_UNROLL_M ZGEMM_UNROLL_M -#define GEMM_UNROLL_N ZGEMM_UNROLL_N -#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN -#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P CGEMM_P -#define GEMM_Q CGEMM_Q -#define GEMM_R CGEMM_R -#define GEMM_UNROLL_M CGEMM_UNROLL_M -#define GEMM_UNROLL_N CGEMM_UNROLL_N -#define GEMM_UNROLL_MN CGEMM_UNROLL_MN -#define GEMM_DEFAULT_P CGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R CGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#endif -#endif - -#ifdef XDOUBLE -#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N -#elif defined(DOUBLE) -#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N -#else -#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N -#endif - - -#ifndef QGEMM_DEFAULT_UNROLL_M -#define QGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef QGEMM_DEFAULT_UNROLL_N -#define QGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_M 
-#define XGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_N -#define XGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef GEMM_THREAD -#define GEMM_THREAD gemm_thread_n -#endif - -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) -#endif - -#ifndef SNUMOPT -#define SNUMOPT 2 -#endif - -#ifndef DNUMOPT -#define DNUMOPT 2 -#endif - -#ifndef QNUMOPT -#define QNUMOPT 1 -#endif - -#ifndef GEMM3M_P -#ifdef XDOUBLE -#define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) -#define GEMM3M_P ZGEMM3M_P -#else -#define GEMM3M_P CGEMM3M_P -#endif -#endif - -#ifndef GEMM3M_Q -#ifdef XDOUBLE -#define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) -#define GEMM3M_Q ZGEMM3M_Q -#else -#define GEMM3M_Q CGEMM3M_Q -#endif -#endif - -#ifndef GEMM3M_R -#ifdef XDOUBLE -#define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) -#define GEMM3M_R ZGEMM3M_R -#else -#define GEMM3M_R CGEMM3M_R -#endif -#endif - - -#endif diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c index 5f91d0dbe..d3ab46472 100644 --- a/kernel/generic/gemm_ncopy_16.c +++ b/kernel/generic/gemm_ncopy_16.c @@ -39,24 +39,24 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; - FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, 
ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c index a49a778e6..aaf9c8917 100644 --- a/kernel/generic/gemm_ncopy_8.c +++ b/kernel/generic/gemm_ncopy_8.c @@ -39,30 +39,30 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c index 56268ebf2..14252599a 100644 --- a/kernel/generic/gemm_tcopy_16.c +++ b/kernel/generic/gemm_tcopy_16.c @@ -39,22 +39,22 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2; - FLOAT *boffset; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT 
ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c index b28f3d219..3e8a839db 100644 --- a/kernel/generic/gemm_tcopy_8.c +++ b/kernel/generic/gemm_tcopy_8.c @@ -39,32 +39,32 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 new file mode 100644 index 000000000..f390fac61 --- /dev/null +++ b/kernel/power/KERNEL.POWER10 @@ -0,0 +1,225 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = shgemm_kernel_power10.c +SHGEMMINCOPY = ../generic/gemm_ncopy_16.c +SHGEMMITCOPY = ../generic/gemm_tcopy_16.c +SHGEMMONCOPY = ../generic/gemm_ncopy_8.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = sgemm_kernel_power10.c +DTRMMKERNEL = dgemm_kernel_power10.c +CTRMMKERNEL = cgemm_kernel_power10.S 
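The FLOAT to IFLOAT switch in the generic ncopy/tcopy sources above is what lets this new rules file reuse them as SHGEMM packing kernels: IFLOAT names the input (storage) element type, which in the half-precision build no longer matches the FLOAT type used for accumulation. A rough sketch of the type split; these typedefs are stand-in assumptions for illustration, not the project's actual common.h definitions:

    #include <stdint.h>
    #if defined(HALF)
    typedef uint16_t IFLOAT;  /* 16-bit (bfloat16) input storage, assumed here for illustration */
    #else
    typedef float IFLOAT;     /* otherwise the input type coincides with the compute type */
    #endif
    typedef float FLOAT;      /* accumulation stays in single precision either way */

With that split, the same generic copy loops pack either float or bfloat16 panels unchanged; any widening to float is left to the architecture-specific SHGEMM kernel itself.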
+ZTRMMKERNEL = zgemm_kernel_power10.S + +SGEMMKERNEL = sgemm_kernel_power10.c +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_power10.c +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
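The conditional assignments a little further down switch on GCCVERSIONGTEQ9: toolchains older than GCC 9 get the hand-written POWER9 assembly sources (isamax_power9.S and friends), newer ones get the C implementations. As a reference for what any i?amax kernel must compute, here is a deliberately naive scalar sketch using the project's BLASLONG integer type; the function name is made up, and this is not the optimized kernel:

    #include <math.h>
    /* Naive ISAMAX reference: 1-based index of the first entry with maximum
       magnitude, or 0 for empty/invalid input, matching BLAS conventions. */
    static BLASLONG isamax_ref(BLASLONG n, const float *x, BLASLONG inc_x) {
      if (n <= 0 || inc_x <= 0) return 0;
      BLASLONG best = 1;
      float maxval = fabsf(x[0]);
      for (BLASLONG i = 1; i < n; i++) {
        float v = fabsf(x[i * inc_x]);
        if (v > maxval) { maxval = v; best = i + 1; }
      }
      return best;
    }

The amin variants are the same loop with the comparison reversed.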
+#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMAXKERNEL = isamax_power9.S +else +ISAMAXKERNEL = isamax.c +endif +IDAMAXKERNEL = idamax.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMAXKERNEL = icamax_power9.S +else +ICAMAXKERNEL = icamax.c +endif +IZAMAXKERNEL = izamax.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMINKERNEL = isamin_power9.S +else +ISAMINKERNEL = isamin.c +endif +IDAMINKERNEL = idamin.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMINKERNEL = icamin_power9.S +else +ICAMINKERNEL = icamin.c +endif +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power9.S +else +CAXPYKERNEL = caxpy.c +endif +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +ifneq ($(GCCVERSIONGTEQ9),1) +CDOTKERNEL = cdot_power9.S +else +CDOTKERNEL = cdot.c +endif +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n_power10.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t_power10.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c7867012b..c2f4cd204 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,44 @@ +# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = 
gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +else + #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -12,7 +53,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S +else DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c +# +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +CAXPYKERNEL = zaxpy.S +else ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power8.S @@ -162,6 +215,7 @@ endif else CAXPYKERNEL = caxpy.c endif +endif # ZAXPYKERNEL = zaxpy.c # @@ -232,3 +286,10 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) +IDAMAXKERNEL = ../arm/iamax.c +IDAMINKERNEL = ../arm/iamin.c +IZAMAXKERNEL = ../arm/izamax.c +IZAMINKERNEL = ../arm/izamin.c +endif diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index aabb5d976..ab8fbfcd9 100644 --- a/kernel/power/KERNEL.POWER9 
+++ b/kernel/power/KERNEL.POWER9 @@ -16,7 +16,7 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index f615754bb..54660b54d 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -CDOTKERNEL = zdot_ppc440.S -ZDOTKERNEL = zdot_ppc440.S +#CDOTKERNEL = zdot_ppc440.S +#ZDOTKERNEL = zdot_ppc440.S +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c ISAMAXKERNEL = iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S @@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -CROTKERNEL = zrot_ppc440.S -ZROTKERNEL = zrot_ppc440.S +#CROTKERNEL = zrot_ppc440.S +#ZROTKERNEL = zrot_ppc440.S +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S @@ -78,13 +83,18 @@ DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = zgemm_kernel_altivec_g4.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +#CGEMMKERNEL = zgemm_kernel_altivec_g4.S +#CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +#CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = +#cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = +#cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_g4.S diff --git a/kernel/power/casum.c b/kernel/power/casum.c index a9ece0768..06982bfba 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "casum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index 50df84cc5..5e58034dd 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ccopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index d5b18729a..ef5e4710f 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zdot.c" +#else #include "common.h" #ifndef HAVE_KERNEL_8 @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA return (result); } +#endif diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 2bc99974f..6be8c128c 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -424,7 +424,7 @@ L999: lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) - addi r11, 224 + addi r11, SP, 224 #endif lvx v20, r11, r0 addi r11, r11, 16 @@ -459,4 +459,4 @@ L999: blr EPILOGUE -#endif^ +#endif diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S new file mode 100644 index 000000000..e04f948dd --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs51 +#define alpha_i vs55 +#define save_permute_1 vs59 +#define permute_mask vs63 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power10.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power10.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S new file mode 100644 index 000000000..3700ac87b --- /dev/null +++ b/kernel/power/cgemm_logic_power10.S @@ -0,0 +1,2814 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 
+ KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 
32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */ + vspltisb v24, -1 + vspltisb v25, 0 + xxsldwi vs57, vs56, vs57, 1 + xxpermdi vs57, vs57, vs57, 3 + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
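/* The SUB2 chains below peel the leftover iteration count L by its binary
+   digits and branch-and-link into fixed-size subroutines like this one.
+   A rough C sketch of the dispatch (the helper name is illustrative only,
+   it does not exist in this file):
+       if (L & 16) kernel_2x4_steps(16);   // CGEMM_2x4_L16_SUB above
+       if (L & 8)  kernel_2x4_steps(8);    // CGEMM_2x4_L8_SUB below
+       if (L & 4)  kernel_2x4_steps(4);    // inline LOAD/KERNEL/E2 runs
+       if (L & 2)  kernel_2x4_steps(2);
+       if (L & 1)  kernel_2x4_steps(1);    // a single KERNEL2x4
+*/ + 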
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
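/* The _E2 ("end") variants pass Complete=1 to the underlying KERNEL*_2
+   macros, where the _L2 ("loop") variants pass 0, so the final k-pair is
+   presumably closed out without loading operands past the packed buffers. */ + 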
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi.
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi.
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S new file mode 100644 index 000000000..b66e93405 --- /dev/null +++ b/kernel/power/cgemm_macros_power10.S @@ -0,0 +1,2131 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 8 +#define DISP32(ind, disp) (ind*unit_size*32+disp) +#define DISP16(ind, disp) (ind*unit_size*16+disp) +#define DISP8(ind, disp) (ind*unit_size*8+disp) +#define DISP4(ind, disp) (ind*unit_size*4+disp) +#define DISP2(ind, disp) (ind*unit_size*2+disp) +#define DISP1(ind, disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha imaginary instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha imaginary instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} (subtracted in PART2); VSOUT2 = {r0,r1} * {alpha_i,alpha_i} (added to in PART2) */ + +.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmulsp \VSOUT1, \VSINII, alpha_i + xvmulsp \VSOUT2, \VSINRR, alpha_i +.endm + +/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmsubasp \VSOUT1, \VSINRR, alpha_r + xvmaddasp \VSOUT2, \VSINII, alpha_r +.endm + +.macro PERMUTE1 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, \OUT, vs62, 1 +.endm +.macro PERMUTE2 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, vs62, \OUT, 1 + xxperm \OUT, \OUT, permute_mask +.endm +.macro PERMUTE3 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, vs62, \OUT, 2 +.endm +.macro PERMUTE4 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, \OUT, vs62, 2 + xxperm \OUT, \OUT, permute_mask +.endm +.macro GROUP1 + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + xxperm vs9, vs37, permute_mask + xxperm vs13, vs45, permute_mask +.endm +.macro AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + 
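/* Alpha scaling recap, as wired in MULT_APLHA_PART1/PART2 above: for an
+   accumulated element r + i*im and scale alpha_r + i*alpha_i, the two
+   passes compute
+       out_r = r*alpha_r - im*alpha_i   (xvmulsp, then xvmsubasp)
+       out_i = r*alpha_i + im*alpha_r   (xvmulsp, then xvmaddasp)
+   i.e. an ordinary complex multiply split into two FMA passes; the
+   conjugate variants (CN/NC/CC/...) fix signs earlier, in
+   AGGREGATE_REALS_IMAGES. */ + 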
AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13 +.endm +.macro GROUP2 + xxperm vs0, vs34, permute_mask + xxperm vs4, vs42, permute_mask + xxperm vs1, vs35, permute_mask + xxperm vs5, vs43, permute_mask + xxperm vs8, vs38, permute_mask + xxperm vs12, vs46, permute_mask + xxperm vs9, vs39, permute_mask + xxperm vs13, vs47, permute_mask +.endm +.macro AGG_GROUP2 + AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4 + AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5 + AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12 + AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13 +.endm +.macro MULTIPLY_GROUP1 + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +.endm +.macro MULTIPLY_GROUP2 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 +.endm +/* reconstruct r, i pairs*/ +.macro RECONSTRUCT_PAIR1 + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs8, vs9, save_permute_1 + xxperm vs10, vs11, save_permute_1 +.endm +.macro RECONSTRUCT_PAIR2 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 + xxperm vs12, vs13, save_permute_1 + xxperm vs14, vs15, save_permute_1 +.endm +.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 + xxmfacc \ACC + PERMUTE1 \O1, \R3, \R2, \R1, \R0 + PERMUTE2 \O2, \R1, \R0, \R3, \R2 + PERMUTE3 \O3, \R1, \R0, \R3, \R2 + PERMUTE4 \O4, \R3, \R2, \R1, \R0 +.endm +/* macros for N=4 and M=8 +**********************************************************************************************/ +.macro ZERO4x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + +.macro LOAD4x8 + LOAD4x8O 0, 0 +.endm + +.macro LOAD4x8O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END4x8_NORMAL + END4x8 AO, BO, 64, 32 +.endm + +.macro END4x8_WITHOUT_ADD + END4x8 AO, BO, 0, 0 +.endm + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.endm + +.macro LOAD4x8_2 + LOAD4x8_2O 0, 0 +.endm + +.macro LOAD4x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs40, (64+\OffsetA)(AO) + lxvp vs42, (64+32+\OffsetA)(AO) +.endm + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1 +.endm + +.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + 
xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 42, 39 + xvf32gerpp 2, 43, 39 + xvf32gerpp 1, 40, 39 + xvf32gerpp 0, 41, 39 + xvf32gerpp 7, 42, 38 + xvf32gerpp 6, 43, 38 + xvf32gerpp 5, 40, 38 + xvf32gerpp 4, 41, 38 +.if \Complete==0 + lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64, 32 +.endm + +.macro SAVE4x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 + SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 + SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 + SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + xxperm vs10, vs38, permute_mask + xxperm vs14, vs46, permute_mask + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + xxperm vs11, vs39, permute_mask + xxperm vs15, vs47, permute_mask + xxperm vs0, vs48, permute_mask + xxperm vs4, vs56, permute_mask + xxperm vs1, vs49, permute_mask + xxperm vs5, vs16, permute_mask + AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 + xxperm vs2, vs50, permute_mask + xxperm vs6, vs58, permute_mask + AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 + xxperm vs3, vs17, permute_mask + xxperm vs7, vs19, permute_mask + AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 + xxperm vs8, vs52, permute_mask + xxperm vs12, vs60, permute_mask + AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 + xxperm vs9, vs53, permute_mask + xxperm vs13, vs61, permute_mask + AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 + xxperm vs10, vs54, permute_mask + xxperm vs14, vs21, permute_mask + AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 + xxperm vs11, vs18, permute_mask + xxperm vs15, vs20, permute_mask + AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 + AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 +/*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +#ifndef TRMMKERNEL + lxvp vs32, 0(T2) +#endif + MULT_APLHA_PART1 vs36, vs44, 
vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs40, 32(T2) +#endif + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 +#ifndef TRMMKERNEL + lxvp vs34, 0(T3) +#endif + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs42, 32(T3) +#endif + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + MULT_APLHA_PART1 vs48, vs56, vs0, vs1 + MULT_APLHA_PART1 vs49, vs16, vs2, vs3 + stxvp vs26, 32(CO) + MULT_APLHA_PART1 vs50, vs58, vs4, vs5 + MULT_APLHA_PART1 vs17, vs19, vs6, vs7 + stxvp vs28, 0(T1) + MULT_APLHA_PART2 vs48, vs56, vs0, vs1 + MULT_APLHA_PART2 vs49, vs16, vs2, vs3 + stxvp vs30, 32(T1) + MULT_APLHA_PART2 vs50, vs58, vs4, vs5 + MULT_APLHA_PART2 vs17, vs19, vs6, vs7 + MULT_APLHA_PART1 vs52, vs60, vs8, vs9 + MULT_APLHA_PART1 vs53, vs61, vs10, vs11 + MULT_APLHA_PART1 vs54, vs21, vs12, vs13 + MULT_APLHA_PART1 vs18, vs20, vs14, vs15 + MULT_APLHA_PART2 vs52, vs60, vs8, vs9 + MULT_APLHA_PART2 vs53, vs61, vs10, vs11 + MULT_APLHA_PART2 vs54, vs21, vs12, vs13 + MULT_APLHA_PART2 vs18, vs20, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs32, vs32, vs3 + xvaddsp vs33, vs33, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs40, vs40, vs7 + xvaddsp vs41, vs41, vs5 + xvaddsp vs34, vs34, vs11 + xvaddsp vs35, vs35, vs9 + xvaddsp vs42, vs42, vs15 + xvaddsp vs43, vs43, vs13 +#else + xxpermdi vs33, vs8, vs0, 2 + xxpermdi vs32, vs10, vs2, 2 + xxpermdi vs41, vs12, vs4, 2 + xxpermdi vs40, vs14, vs6, 2 + xxpermdi vs35, vs0, vs8, 2 + xxpermdi vs34, vs2, vs10, 2 + xxpermdi vs43, vs4, vs12, 2 + xxpermdi vs42, vs6, vs14, 2 +#endif + stxvp vs32, 0(T2) + stxvp vs40, 32(T2) + stxvp vs34, 0(T3) + stxvp vs42, 32(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro ZERO4x4 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD4x4 + LOAD4x4O 0, 0 +.endm + +.macro LOAD4x4O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END4x4_NORMAL + END4x4 AO, BO, 32, 32 +.endm + +.macro END4x4_WITHOUT_ADD + END4x4 AO, BO, 0, 0 +.endm + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.endm + 
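+/* Note on the MMA scheme used throughout this file (an explanatory
+ sketch only, nothing in this comment is assembled): xvf32gerpp acc,
+ va, vb accumulates the rank-1 outer product of the four floats in va
+ and the four floats in vb into the 4x4 accumulator acc. Because both
+ inputs hold interleaved {real, imag} pairs, each accumulator collects
+ mixed r*r, r*i, i*r and i*i partial products; SHUFFLE_ACC,
+ AGGREGATE_REALS_IMAGES and the MULT_APLHA_PART macros then recombine
+ them into the complex result. A rough scalar C model, with
+ illustrative names only:
+
+ for (int j = 0; j < 4; j++)
+ for (int k = 0; k < 4; k++)
+ acc[j][k] += va[j] * vb[k];
+*/
+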
+.macro LOAD4x4_2 + LOAD4x4_2O 0, 0 +.endm + +.macro LOAD4x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 +.endm + +.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 36, 38 + xvf32gerpp 2, 37, 38 + xvf32gerpp 1, 36, 39 + xvf32gerpp 0, 37, 39 +.if \Complete==0 + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32, 32 +.endm + +.macro SAVE4x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + #ifndef TRMMKERNEL + lxvp vs28, 0(T2) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 0(T3) +#endif + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 + xvaddsp vs28, vs28, vs7 + xvaddsp vs29, vs29, vs5 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 + xxpermdi vs29, vs12, vs4, 2 + xxpermdi vs28, vs14, vs6, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + stxvp vs28, 0(T2) + stxvp vs30, 0(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x2 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x2 + LOAD4x2O 0, 0 +.endm + +.macro LOAD4x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x2_NORMAL + END4x2 AO, BO, 16, 32 +.endm + +.macro END4x2_WITHOUT_ADD + END4x2 AO, BO, 0, 0 +.endm + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 1, 34, 32 + xvf32gerpp 0, 35, 32 +.endm + +.macro 
LOAD4x2_2 + LOAD4x2_2O 0, 0 +.endm + +.macro LOAD4x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 +.endm + +.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 1, 34, 33 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 1, 36, 32 + xvf32gerpp 0, 37, 32 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16, 32 +.endm + +.macro SAVE4x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25, 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27, 0(T3) +#endif + GROUP1 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs10, vs2, 0 + xxpermdi vs3, vs0, vs8, 3 + xxpermdi vs11, vs2, vs10, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 + xvaddsp vs25, vs25, vs3 + xvaddsp vs27, vs27, vs11 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs10, vs2, 0 + xxpermdi vs25, vs0, vs8, 3 + xxpermdi vs27, vs2, vs10, 3 +#endif + stxv vs24, 0(CO) + stxv vs25, 0(T1) + stxv vs26, 0(T2) + stxv vs27, 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=1 +**********************************************************************************************/ + +.macro ZERO4x1 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x1 + LOAD4x1O 0, 0 +.endm + +.macro LOAD4x1O OffsetA, OffsetB + lxsd v0, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x1_NORMAL + END4x1 AO, BO,8, 32 +.endm + +.macro END4x1_WITHOUT_ADD + END4x1 AO, BO, 0, 0 +.endm + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD4x1_2 + LOAD4x1_2O 0, 0 +.endm + +.macro LOAD4x1_2O OffsetA, OffsetB + lxv vs32, (\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 +.endm + +.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x1_L2 OffsetA, 
OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 0, 37, 33 + xvf32gerpp 1, 36, 33 +.if \Complete==0 + lxv vs32, DISP2(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8, 32 +.endm + +.macro SAVE4x1 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 1 + xxpermdi vs40, vs40, vs44, 1 + xxpermdi vs33, vs33, vs37, 1 + xxpermdi vs41, vs41, vs45, 1 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6, 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7, 0(T3) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + xxspltd vs9, vs2, 0 + xxspltd vs11, vs2, 1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 + xvaddsp vs38, vs38, vs9 + xvaddsp vs39, vs39, vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 + xxspltd vs38, vs2, 0 + xxspltd vs39, vs2, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + stxsd v6, 0(T2) + stxsd v7, 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro ZERO2x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD2x8 + LOAD2x8O 0, 0 +.endm + +.macro LOAD2x8O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END2x8_NORMAL + END2x8 AO, BO, 64, 16 +.endm + +.macro END2x8_WITHOUT_ADD + END2x8 AO, BO, 0, 0 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x8_2 + LOAD2x8_2O 0, 0 +.endm + +.macro LOAD2x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 +.endm + +.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, 
BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 2, 37, 35 + xvf32gerpp 3, 36, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 + +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 41, 34 + xvf32gerpp 3, 40, 34 + xvf32gerpp 0, 39, 34 + xvf32gerpp 1, 38, 34 + +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64, 16 +.endm + +.macro SAVE2x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) + stxvp vs28, 0(T1) + stxvp vs30, 32(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro ZERO2x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD2x4 + LOAD2x4O 0, 0 +.endm + +.macro LOAD2x4O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END2x4_NORMAL + END2x4 AO, BO, 32, 16 +.endm + +.macro END2x4_WITHOUT_ADD + END2x4 AO, BO, 0, 0 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x4_2 + LOAD2x4_2O 0, 0 +.endm + +.macro LOAD2x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 +.endm + +.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 
AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 37, 34 + xvf32gerpp 1, 36, 34 +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32, 16 +.endm + +.macro SAVE2x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + GROUP1 + AGG_GROUP1 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro ZERO2x2 + xxsetaccz 0 +.endm + +.macro LOAD2x2 + LOAD2x2O 0, 0 +.endm + +.macro LOAD2x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxv vs34, (\OffsetB+0)(BO) +.endm + +.macro END2x2_NORMAL + END2x2 AO, BO, 16, 16 +.endm + +.macro END2x2_WITHOUT_ADD + END2x2 AO, BO, 0, 0 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 32 +.endm + +.macro LOAD2x2_2 + LOAD2x2_2O 0, 0 +.endm + +.macro LOAD2x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) +.endm + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 +.endm + +.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 32 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs34, DISP4(\Index, \OffsetA)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16, 16 +.endm + +.macro SAVE2x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + 
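+/* note: permute_mask swaps the real and imaginary lanes of each pair;
+ AGGREGATE_REALS_IMAGES_A_PERMUTE below then forms the sign
+ combinations required by the active conjugation variant */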
xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs8, vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs0, vs8, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs0, vs8, 3 +#endif + stxv vs24, 0(CO) + stxv vs26, 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD2x1 + LOAD2x1O 0, 0 +.endm + +.macro LOAD2x1O OffsetA, OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_NORMAL + END2x1 AO, BO,8, 16 +.endm + +.macro END2x1_WITHOUT_ADD + END2x1 AO, BO, 0, 0 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD2x1_2 + LOAD2x1_2O 0, 0 +.endm + +.macro LOAD2x1_2O OffsetA, OffsetB + lxv vs27, (\OffsetA)(AO) + lxvp vs4, (0+\OffsetB)(BO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 +.endm + +.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetA)(\AREG) + xxspltd vs8, vs27, 1 +.endif +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8, 16 +.endm + +.macro SAVE2x1 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 +#else + /*--v4==vs36 
v5==vs37---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro ZERO1x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD1x8 + LOAD1x8O 0, 0 +.endm + +.macro LOAD1x8O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END1x8_NORMAL + END1x8 AO, BO, 64,8 +.endm + +.macro END1x8_WITHOUT_ADD + END1x8 AO, BO, 0, 0 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.endm + +.macro LOAD1x8_2 + LOAD1x8_2O 0, 0 +.endm + +.macro LOAD1x8_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + vspltisb v10, 0 + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 +.endm + +.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.if \Complete==0 + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 39 + xvf32gerpp 1, 35, 38 +.if \Complete==0 + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 35, 41 + xvf32gerpp 3, 35, 40 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + +.macro SAVE1x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs34, vs34, vs38, 0 + xxpermdi vs35, vs35, vs39, 0 + xxpermdi vs40, vs40, vs44, 0 + xxperm vs40, vs40, permute_mask + xxpermdi vs41, vs41, vs45, 0 + xxperm vs41, vs41, permute_mask + xxpermdi vs42, vs42, vs46, 0 + xxperm vs42, vs42, permute_mask + xxpermdi vs43, vs43, vs47, 0 + xxperm vs43, vs43, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + AGGREGATE_REALS_IMAGES vs32, 
vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 + xxperm vs4, vs5, vs28 + xxperm vs6, vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + xvaddsp vs26, vs26, vs6 + xvaddsp vs27, vs27, vs4 + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 +**********************************************************************************************/ + +.macro ZERO1x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD1x4 + LOAD1x4O 0, 0 +.endm + +.macro LOAD1x4O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END1x4_NORMAL + END1x4 AO, BO, 32,8 +.endm + +.macro END1x4_WITHOUT_ADD + END1x4 AO, BO, 0, 0 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD1x4_2 + LOAD1x4_2O 0, 0 +.endm + +.macro LOAD1x4_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 +.endm + +.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 37 + xvf32gerpp 1, 35, 36 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + +.macro SAVE1x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs40, vs40, vs44, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs41, vs41, vs45, 0 + xxperm vs40, vs40, permute_mask + xxperm vs41, vs41, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES 
vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + stxvp vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro ZERO1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x2 + LOAD1x2O 0, 0 +.endm + +.macro LOAD1x2O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_NORMAL + END1x2 AO, BO, 16,8 +.endm + +.macro END1x2_WITHOUT_ADD + END1x2 AO, BO, 0, 0 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD1x2_2 + LOAD1x2_2O 0, 0 +.endm + +.macro LOAD1x2_2O OffsetA, OffsetB + lxv vs27, (\OffsetB)(BO) + lxvp vs4, (0+\OffsetA)(AO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 +.endm + +.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 + +.if \Complete==0 + xxspltd vs8, vs27, 1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP4(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs0 + stxv vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro ZERO1x1 + xxlxor vs32, vs32, 
vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x1 + LOAD1x1O 0, 0 +.endm + +.macro LOAD1x1O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + +.macro END1x1_NORMAL + END1x1 AO, BO,8,8 +.endm + +.macro END1x1_WITHOUT_ADD + END1x1 AO, BO, 0, 0 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs37, vs36 + xvmaddasp vs40, vs37, vs38 +.endm + +.macro LOAD1x1_2 + LOAD1x1_2O 0, 0 +.endm + +.macro LOAD1x1_2O OffsetA, OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 +.endm + +.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs4, vs8 + xvmaddasp vs40, vs4, vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index, \OffsetB)(\BREG) + lxv vs4, DISP2(\Index, \OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP2(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP2(\Index, 16) +.endif +.endif +.endm + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33, vs32, vs32, 2 + xxpermdi vs41, vs40, vs40, 2 + xvaddsp vs32, vs32, vs33 + xvaddsp vs40, vs40, vs41 + + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs37, vs1 + MULT_APLHA_PART2 vs32, vs40, vs37, vs1 +/* reconstruct r, i pairs*/ + xxperm vs37, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36, vs36, vs37 + stxsd v4, 0(CO) +#else +/* vs37 is v5 */ + stxsd v5, 0(CO) +#endif + addi CO, CO, 8 +.endm + +/****************************TRMM POINTER REFRESH MACROS*************************/ +.macro SHIFT_REG REG1,REG2,SHIFT_VAL +.if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 +.elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 +.elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 +.elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 +.elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 +.endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +/* ptrbb = bb;*/ + mr \PTR_B, \B_VAL /* refresh BPOINT */ +#else +/* +// ptrba =ptrba+ off*C_A; +// ptrbb = bb + off*C_B; +*/ + SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */ + SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL, T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ +#endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif 
defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK, \OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ +.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK, \TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK, \TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4, \TEMP_BK, \C_A + SHIFT_REG T2, \TEMP_BK, \C_B + add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B, T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL, \OFF_VAL, \C_A + #endif +.endm diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index eec3fa37c..8663039c5 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_n.c" +#else #include #include @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 691f7a3d3..1bfc235db 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else #include "common.h" @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 2a5835546..84ba5d913 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) } #endif +#endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { - +#if defined(__VEC__) || defined(__ALTIVEC__) BLASLONG n1 = n & -8; if ( n1 > 0 ) { @@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; ix=2*n1; } - +#endif while(i < n) { temp[0] = c*x[ix] + s*y[ix] ; diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 31e02fe5a..5144a2e93 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "cswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index d0e060977..999dc677a 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index f09611ff0..2de4e0911 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "daxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 27b39144b..24279f8a2 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index f985df1c5..c5493015a 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ddot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c new file mode 100644 index 000000000..a0bc1a777 --- /dev/null +++ b/kernel/power/dgemm_kernel_power10.c @@ -0,0 +1,864 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#ifdef TRMMKERNEL +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[1] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + 
rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + N = n >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + BLASLONG l = 0; + PREFETCH1 (CO, 0); + PREFETCH1 (CO + ldc, 0); + PREFETCH1 (CO + ldc + ldc, 0); + PREFETCH1 (CO + ldc + ldc + ldc, 0); + PREFETCH1 (CO, 128); + PREFETCH1 (CO + ldc, 128); + PREFETCH1 (CO + ldc + ldc, 128); + PREFETCH1 (CO + ldc + ldc + ldc, 128); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + 
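+	      /* each xvf64gerpp accumulates a 4x2 (n x m) outer-product
+	         block; acc0..acc7 together cover the full 16-wide m tile
+	         for this iteration of l */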
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC (&acc5, 10); + SAVE_ACC (&acc7, 14); + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) 
&& !defined(LEFT) + off += 4; // number of values in A +#endif + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + SAVE2x4_ACC (&acc4, 8); + SAVE2x4_ACC (&acc5, 10); + SAVE2x4_ACC (&acc6, 12); + SAVE2x4_ACC (&acc7, 14); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad 
acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + v4sf_t t4 = { 0, 0 }; + v4sf_t t5 = { 0, 0 }; + v4sf_t t6 = { 0, 0 }; + v4sf_t t7 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; + v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; + v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; + v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; + v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; + v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; + v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; + v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + t4 += rowA4 * rowB; + t5 += rowA5 * rowB; + t6 += rowA6 * rowB; + t7 += rowA7 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + t4 = t4 * valpha; + t5 = t5 * valpha; + t6 = t6 * valpha; + t7 = t7 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; + CO[8] = t4[0]; + CO[9] = t4[1]; + CO[10] = t5[0]; + CO[11] = t5[1]; + CO[12] = t6[0]; + CO[13] = t6[1]; + CO[14] = t7[0]; + CO[15] = t7[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; + CO[8] += t4[0]; + CO[9] += t4[1]; + CO[10] += t5[0]; + CO[11] += t5[1]; + CO[12] += t6[0]; + CO[13] += t6[1]; + CO[14] += t7[0]; + CO[15] += t7[1]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + 
v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index 1a3d7669c..ac365b3b2 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,9 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
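A word on the MMA pattern the GEMM kernel above (and the dgemv kernels that follow) relies on: __builtin_mma_xxsetaccz clears a 512-bit __vector_quad accumulator, __builtin_mma_xvf64gerpp rank-one-updates it with an FP64 outer product on each k iteration, and __builtin_mma_disassemble_acc spills it into four vector registers that the SAVE_ACC macros scale by alpha and store. Roughly, ignoring the register-ordering details that the rb[1], rb[0] pair assembly compensates for, each xvf64gerpp behaves like the scalar model below (a sketch, not the shipped code):

/* Scalar model of one FP64 MMA accumulate step: a __vector_pair holding
 * four doubles of B times a vector holding two packed doubles of A
 * updates a 4x2 accumulator tile, acc[r][c] += b[r] * a[c]. */
static void xvf64gerpp_model (double acc[4][2], const double b[4], const double a[2])
{
  int r, c;
  for (r = 0; r < 4; r++)
    for (c = 0; c < 2; c++)
      acc[r][c] += b[r] * a[c];
}

Because alpha is applied only at SAVE time, the k loop is a pure multiply-accumulate stream, which is what lets eight accumulators cover a 16x4 tile of C per iteration.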
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dgemv_n_microk_power8.c" #endif +#endif #define NBMAX 4096 diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c new file mode 100644 index 000000000..4be8a5f9b --- /dev/null +++ b/kernel/power/dgemv_n_microk_power10.c @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + double *a0; + double *a1; + double *a2; + double *a3; + + __asm__ + ( + "lxvp 40, 0(%10) \n\t" // x0, x1 + XXSPLTD_S(32,%x9,0) // alpha, alpha + + "sldi %6, %13, 3 \n\t" // lda * sizeof (double) + + "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha + + "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda + "add %6, %6, %6 \n\t" // 2 * lda + + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + + "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda + + "dcbt 0, %3 \n\t" + "dcbt 0, %4 \n\t" + "dcbt 0, %5 \n\t" + "dcbt 0, %6 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "dcbt 0, %2 \n\t" + + "addi %3, %3, 32 \n\t" + "addi %4, %4, 32 \n\t" + "addi %5, %5, 32 \n\t" + "addi %6, %6, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" + "#a0=%3 a1=%4 a2=%5 a3=%6" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3) // 6 + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 9 + "r" (x), // 10 + "b" (16), // 11 + "3" (ap), // 12 + "4" (lda) // 13 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c new file mode 100644 index 000000000..ad5f1ba0d --- /dev/null +++ b/kernel/power/dgemv_n_power10.c @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef __vector_pair __attribute__((aligned(8))) vecp_t; + +#include "dgemv_n_microk_power10.c" + +#define MMA(X, APTR, ACC) \ + rX = (vec_t *) & X; \ + rowA = *((vecp_t*)((void*)&APTR)); \ + __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); + +#define SAVE(ACC, Z) \ + rowC = (v4sf_t *) &y[Z]; \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][1] = result[1][0]; \ + result[2][1] = result[3][0]; \ + rowC[0] += valpha * result[0]; \ + rowC[1] += valpha * result[2]; + +void +dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, + FLOAT * y, FLOAT alpha) +{ + BLASLONG i, j, tmp; + FLOAT *a0 = a_ptr; + FLOAT *x1 = xo; + vector double valpha = { alpha, alpha }; + v4sf_t *rowC; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + v4sf_t result[4]; + vecp_t rowA; + vec_t *rX; + tmp = (n / 32) * 32; + for (i = 0; i < tmp; i += 32) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + __builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j],
a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i + 0); + SAVE (&acc1, i + 4); + SAVE (&acc2, i + 8); + SAVE (&acc3, i + 12); + SAVE (&acc4, i + 16); + SAVE (&acc5, i + 20); + SAVE (&acc6, i + 24); + SAVE (&acc7, i + 28); + + } + for (i = tmp; i < n; i += 4) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i); + } +} + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16))); + FLOAT *a0 = a_ptr; + FLOAT *a1 = a0 + lda; + FLOAT *a2 = a1 + lda; + FLOAT *a3 = a2 + lda; + + + for ( i=0; i<4; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16))); + + for ( i=0; i<2; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16))); + + for ( i=0; i<1; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i<n; i++ ) + { + *dest += *src; + src++; + dest += inc_dest; + } + } + else + { + for ( i=0; i<n; i++ ) + { + *dest += *src; + src++; + dest++; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG n1, n2, n128; + BLASLONG m1, m2, m3; + BLASLONG lda4 = lda << 2; + BLASLONG lda128 = lda << 7; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT xbuffer[8] __attribute__ ((aligned (16))); + FLOAT *ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n128 = n >> 7; + n1 = (n - (n128 * 128)) >> 2; + n2 = (n - (n128 * 128)) & 3; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + for( i = 0; i < n128 ; i++) + { + dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda128; + x_ptr += 128; + } + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr +=
lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + for( i = 0; i < n128 ; i++) + { + FLOAT xbuffer[128] __attribute__ ((aligned (16))); + BLASLONG j; + for ( j = 0; j < 128 ; j++) + { + xbuffer[j] = x_ptr[0]; + x_ptr += inc_x; + } + dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda128; + } + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c 
index 5d43f673f..c07b3c223 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" +#else + +#include "common.h" #define NBMAX 1024 //#define PREFETCH 1 + #include <altivec.h> #define HAVE_KERNEL4x8_ASM 1 - #if defined(HAVE_KERNEL4x8_ASM) static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { @@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "stxvd2x 39, %[off], %[y] \n\t" "stxvd2x 40, %[off2], %[y] \n\t" - : [memy] "+m" (*(const double (*)[8])y), + : [memy] "+m" (*(double (*)[8])y), [n] "+&r" (n), [a0] "=b" (a0), [a1] "=&b" (a1), @@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do [off2]"=&b" (off2), [temp] "=&b" (tempR) : [memx] "m" (*(const double (*)[n])x), - [mem_ap] "m" (*(const double (*)[]) ap), + [mem_ap] "m" (*(const double (*)[n*8]) ap), [alpha] "d" (alpha), "[a0]" (ap), [x] "b" (x), @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return (0); } +#endif diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c new file mode 100644 index 000000000..3db4d5785 --- /dev/null +++ b/kernel/power/dgemv_t_power10.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/ + +#include "common.h" + +#define NBMAX 1024 +//#define PREFETCH 1 +#include <altivec.h> + +#define HAVE_KERNEL4x8_ASM 1 + + +#if defined(HAVE_KERNEL4x8_ASM) +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + BLASLONG off2; + BLASLONG tempR; + __asm__( + + "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 + "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) + "xxlxor 34,34,34 \n\t" + "xxlxor 35,34,34 \n\t" + "add %[a2], %[a0], %[temp] \n\t" + "add %[a1], %[a0], %[off] \n\t" + "xxlxor 4,34,34 \n\t" + "xxlxor 5,34,34 \n\t" + "xxlxor 6,34,34 \n\t" + "xxlxor 7,34,34 \n\t" + "add %[a3], %[a2], %[off] \n\t" + "add %[a4], %[a2], %[temp] \n\t" + + "xxlxor 8,34,34 \n\t" + "xxlxor 9,34,34 \n\t" + "add %[a5], %[a3], %[temp] \n\t" + "li %[off],0 \n\t" + "li %[off2],16 \n\t" + + "add %[a6], %[a4], %[temp] \n\t" + "add %[a7], %[a5], %[temp] \n\t" + + + + + "lxvp 32, 0(%[x]) \n\t" + "lxvp 36, 0(%[a0]) \n\t" + "lxvp 38, 0(%[a1]) \n\t" + "lxvp 40, 0(%[a2]) \n\t" + "lxvp 42, 0(%[a3]) \n\t" + "lxvp 44, 0(%[a4]) \n\t" + "lxvp 46, 0(%[a5]) \n\t" + "lxvp 48, 0(%[a6]) \n\t" + "lxvp 50, 0(%[a7]) \n\t" +#if defined(PREFETCH) + "li %[temp],896 \n\t" +#endif + "addic. %[n],%[n],-4 \n\t" + + "li %[off],32 \n\t" + + + "ble- two%= \n\t" + + //-------------------------------------------------- + ".align 5 \n\t" + "one%=: \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 32(%[a0]) \n\t" + "lxvp 38, 32(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 32(%[a2]) \n\t" + "lxvp 42, 32(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 32(%[a4]) \n\t" + "lxvp 46, 32(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. %[n],%[n],-4 \n\t" + "lxvp 48, 32(%[a6]) \n\t" + "lxvp 50, 32(%[a7]) \n\t" + "lxvp 32, 32(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 64(%[a0]) \n\t" + "lxvp 38, 64(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 64(%[a2]) \n\t" + "lxvp 42, 64(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 64(%[a4]) \n\t" + "lxvp 46, 64(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic.
%[n],%[n],-4 \n\t" + "lxvp 48, 64(%[a6]) \n\t" + "lxvp 50, 64(%[a7]) \n\t" + "lxvp 32, 64(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" +#if defined(PREFETCH) + "addi %[temp],%[temp],128 \n\t" +#endif + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a0] \n\t" +#endif + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 96(%[a0]) \n\t" + "lxvp 38, 96(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a1] \n\t" +#endif + "lxvp 40, 96(%[a2]) \n\t" + "lxvp 42, 96(%[a3]) \n\t" + "addi %[off], %[off],32 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 96(%[a4]) \n\t" + "lxvp 46, 96(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a3] \n\t" +#endif + "lxvp 48, 96(%[a6]) \n\t" + "lxvp 50, 96(%[a7]) \n\t" + "lxvp 32, 96(%[x]) \n\t" + + "addic. %[n],%[n],-4 \n\t" + "ble- two%= \n\t" + + "addi %[off2], %[off2],32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a2] \n\t" +#endif + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a4] \n\t" +#endif + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a5] \n\t" +#endif + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 128(%[a0]) \n\t" + "lxvp 38, 128(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 128(%[a2]) \n\t" + "lxvp 42, 128(%[a3]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a6] \n\t" +#endif + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 128(%[a4]) \n\t" + "lxvp 46, 128(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a7] \n\t" +#endif + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 128(%[a6]) \n\t" + "lxvp 50, 128(%[a7]) \n\t" + "lxvp 32, 128(%[x]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[x] \n\t" +#endif + "addi %[a0], %[a0], 128 \n\t" + "addi %[a1], %[a1], 128 \n\t" + "addi %[a2], %[a2], 128 \n\t" + "addi %[a3], %[a3], 128 \n\t" + "addi %[a4], %[a4], 128 \n\t" + "addi %[a5], %[a5], 128 \n\t" + "addi %[a6], %[a6], 128 \n\t" + "addi %[a7], %[a7], 128 \n\t" + "addi %[x], %[x], 128 \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" + //-------------------------------------------- + + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + XXSPLTD_S(36,%x[alpha],0) + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "lxvp 38, 0(%[y]) \n\t" + "lxvp 40, 32(%[y]) \n\t" + + + + XXMRGLD_S(42,35,34) + XXMRGHD_S(43,35,34) + + XXMRGLD_S(44,5,4) + XXMRGHD_S(45,5,4) + + "xvadddp 42,42,43 \n\t" + + XXMRGLD_S(46,7,6) + XXMRGHD_S(47,7,6) + + "xvadddp 44,44,45 \n\t" + + XXMRGLD_S(48,9,8) + XXMRGHD_S(49,9,8) + + "xvadddp 46,46,47 \n\t" + + "xvmaddadp 39,42,36 \n\t" + "xvmaddadp 38,44,36 \n\t" + + "xvadddp 48,48,49 \n\t" + + "xvmaddadp 41,46,36 \n\t" + + "stxvp 38, 0(%[y]) \n\t" + "xvmaddadp 40,48,36 \n\t" + "stxvp 40, 32(%[y]) \n\t" + + : [memy] "+m" (*(double (*)[8])y), + [n] "+&r" (n), + [a0] "=b" (a0), + [a1] "=&b" (a1), + [a2] "=&b" (a2), + [a3] "=&b" (a3), + [a4] "=&b" (a4), + [a5] "=&b" (a5), + [a6] "=&b" (a6), + [a7] "=&b" (a7), + [off] "+&b" (lda), + [off2]"=&b" (off2), + [temp] "=&b" (tempR) + : [memx] "m" (*(const double (*)[n])x), + [mem_ap] "m" (*(const double (*)[n*8]) ap), + [alpha] "d" (alpha), + "[a0]" (ap), + [x] "b" (x), + [y] "b" (y) + : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", + "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + return; +} +#else +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; +#if defined(PREFETCH) + BLASLONG j, c, k; +#endif + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector double*) a0; + va1 = (__vector double*) a1; + va2 = (__vector double*) a2; + va3 = (__vector double*) a3; + va4 = (__vector double*) a4; + va5 = (__vector double*) a5; + va6 = (__vector double*) a6; + va7 = (__vector double*) a7; + v_x = (__vector double*) x; + +#if defined(PREFETCH) + + c = n >> 1; + + for (j = 0; j < c; j += 64) { + k = (c - j) > 64 ? 
64 : (c - j); + __builtin_prefetch(v_x + 64); + __builtin_prefetch(va0 + 64); + __builtin_prefetch(va1 + 64); + __builtin_prefetch(va2 + 64); + __builtin_prefetch(va3 + 64); + __builtin_prefetch(va4 + 64); + __builtin_prefetch(va5 + 64); + __builtin_prefetch(va6 + 64); + __builtin_prefetch(va7 + 64); + for (i = 0; i < k; i += 2) { +#else + + for (i = 0; i < n/2; i += 2) { +#endif + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + temp0 += v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i + 1] * va1[i + 1]; + temp2 += v_x[i + 1] * va2[i + 1]; + temp3 += v_x[i + 1] * va3[i + 1]; + + temp4 += v_x[i + 1] * va4[i + 1]; + temp5 += v_x[i + 1] * va5[i + 1]; + temp6 += v_x[i + 1] * va6[i + 1]; + temp7 += v_x[i + 1] * va7[i + 1]; + } +#if defined(PREFETCH) + va0 += 64; + va1 += 64; + va2 += 64; + va3 += 64; + va4 += 64; + va5 += 64; + va6 += 64; + va7 += 64; + v_x += 64; + + } +#endif + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + + y[4] += alpha * (temp4[0] + temp4[1]); + y[5] += alpha * (temp5[0] + temp5[1]); + y[6] += alpha * (temp6[0] + temp6[1]); + y[7] += alpha * (temp7[0] + temp7[1]); + +} + +#endif + + +static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* va2 = (__vector double*) a2; + __vector double* va3 = (__vector double*) a3; + __vector double* v_x = (__vector double*) x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i + 1] * va0[i + 1]; + temp5 += v_x[i + 1] * va1[i + 1]; + temp6 += v_x[i + 1] * va2[i + 1]; + temp7 += v_x[i + 1] * va3[i + 1]; + } + + temp0 += temp4; + temp1 += temp5; + temp2 += temp6; + temp3 += temp7; + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + +} + + +static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + __vector double temp1 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]); + y[inc_y] += alpha * (temp1[0] + temp1[1]); +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector double* va0 = (__vector double*) a0; + 
__vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + } + + *y += alpha * (temp0[0] + temp0[1]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; +#if defined(PREFETCH) + __builtin_prefetch(y_ptr+64); +#endif + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < 
(n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + diff --git a/kernel/power/drot.c b/kernel/power/drot.c index baeb54205..951c2f9c9 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "drot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 779a08e9c..39293252b 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
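The dscal.c hunk below repeats the guard pattern this patch applies across the POWER kernel sources: the VSX micro-kernel include is wrapped so it is only compiled when the compiler actually exposes the vector extensions, and the pre-existing generic C path serves as the fallback. A minimal sketch of the resulting file shape (the fallback body here is illustrative, not the shipped code):

#include "common.h"

#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dscal_microk_power8.c"  /* defines HAVE_KERNEL_8 and the fast path */
#endif
#endif

#if !defined(HAVE_KERNEL_8)
/* plain-C scaling loop compiled when the micro-kernel is unavailable */
static void dscal_fallback (BLASLONG n, FLOAT alpha, FLOAT *x)
{
  BLASLONG i;
  for (i = 0; i < n; i++)
    x[i] *= alpha;
}
#endif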
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_8) diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 52b7f50da..ff3f95c79 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 195a8c68e..5016f67dd 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include +#if defined(__VEC__) || defined(__ALTIVEC__) #include +#endif + #if defined(DOUBLE) #define ABS fabs @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { return index; } +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) > maxf) { diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 8a5538821..e37718c48 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find minimum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } - +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) < minf) { diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index 7149da28b..fe9d5bf95 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find maximum index @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { } - +#endif @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 692315b89..94f2383e0 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include <math.h> @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabs #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find minimum index @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } +#endif @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - - diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 5908347d3..733137012 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc..3d3b1613c 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 +#define offset_4 64 +#define offset_5 80 +#define offset_6 96 +#define offset_7 112 +#define offset_8 128 +#define offset_9 144 +#define offset_10 160 +#define offset_11 176 +#define offset_12 192 +#define offset_13 208 +#define offset_14 224 +#define offset_15 240 +#if defined(__VEC__) || defined(__ALTIVEC__) #ifndef HAVE_KERNEL_8 #include <altivec.h> @@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
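The saxpy.c hunk that follows only renames the vector pointers and pins the splatted alpha to 16-byte alignment; the underlying shape is the GCC vector-extension form of AXPY. A reduced sketch of that shape, 4 floats per step instead of the kernel's 64, assuming 16-byte-aligned x and y with n a multiple of 4 (compile with -maltivec or -mvsx):

#include <altivec.h>

static void saxpy_vec4 (long n, float *x, float *y, float alpha)
{
  __vector float v_a __attribute((aligned(16))) = { alpha, alpha, alpha, alpha };
  __vector float *vptr_x = (__vector float *) x;
  __vector float *vptr_y = (__vector float *) y;
  long i;
  for (i = 0; i < n / 4; i++)
    vptr_y[i] += v_a * vptr_x[i];   /* 4-lane multiply-add per step */
}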
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG i = 0; - __vector float v_a = {alpha,alpha,alpha,alpha}; - __vector float * v_y=(__vector float *)y; - __vector float * v_x=(__vector float *)x; + __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; + __vector float * vptr_y =(__vector float *)y; + __vector float * vptr_x =(__vector float *)x; for(; i + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if defined(TRMMKERNEL) +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] = result[6] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[1] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t 
*) &CO[5* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; +#endif +#define KERNEL(i, j) \ + __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ + __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + 
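/* Gloss on the TRMM bookkeeping above, paraphrasing the macros rather than
   adding logic: REFRESH_POINTERS(16, 8) repositions AO/BO for the current
   triangular offset, and REFRESH_TEMP_BK sets the effective depth of the
   inner product -- temp = k - off when (LEFT && !TRANSA) or (!LEFT && TRANSA),
   temp = off + 16 (the m-block) in the remaining LEFT case, and
   temp = off + 8 (the n-block) otherwise -- so the kernel accumulates only
   the part of the panel product that lies inside the triangle. */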
v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 
4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if 
defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 
1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + 
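/* Only two of the four rows disassembled from each accumulator carry data in
   this n=2 slice (B contributed two values per k-step), which is why the
   SAVE2x4_ACC stores below write back result[0] and result[1] only,
   covering the 16 m-entries of the tile across acc0..acc3. */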
SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 
0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l << 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + 
REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 81ac031a3..5dfb18f5b 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" +#else #include "common.h" @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } +#endif diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 3d8a442dc..62c517a9d 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else #include "common.h" @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } +#endif diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c new file mode 100644 index 000000000..1ae9e04bf --- /dev/null +++ b/kernel/power/shgemm_kernel_power10.c @@ -0,0 +1,1044 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include <altivec.h> +#if defined(HALF) && defined(HALFCONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short *q = (unsigned short *) (&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} + +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +vector char mask = + { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, + 0xf +}; + +/* + * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of + * bfloat16 floating-point values as input. Hence this + * merging is needed on A and B matrices. + */ +#define MERGE_ROW(x) vec_perm(x, x, mask) +#define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) +#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) + +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[6] * alpha; + +#define MMA __builtin_mma_xvbf16ger2pp + +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); +
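Two helpers above deserve a gloss. bfloat16tof32 widens a bfloat16 by placing its 16 bits in the high half of a float (the endian-dependent q[] index), which is the same as shifting the bit pattern left by 16; and MERGE_HIGH/MERGE_LOW interleave the 16-bit lanes of two vectors so that A and B reach xvbf16ger2pp in the 4×2 operand layout the comment describes. A portable scalar model of both (function names here are illustrative):

#include <stdint.h>
#include <string.h>

/* same effect as bfloat16tof32: bf16 is the top half of an IEEE-754 float */
static float bf16_to_f32_model (uint16_t h)
{
  uint32_t bits = (uint32_t) h << 16;
  float f;
  memcpy (&f, &bits, sizeof f);
  return f;
}

/* scalar model of vec_mergeh on two 8x16-bit vectors, in AltiVec element
   order: the result takes lanes a0,b0,a1,b1,a2,b2,a3,b3 (vec_mergel does
   the same with lanes 4..7) */
static void mergeh_model (const uint16_t a[8], const uint16_t b[8],
                          uint16_t r[8])
{
  int i;
  for (i = 0; i < 4; i++) {
    r[2 * i]     = a[i];
    r[2 * i + 1] = b[i];
  }
}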
+#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); +/************************************************************************************* +* SHGEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, + IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + BLASLONG N = n; + BLASLONG i1; + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + N = n >> 3; + /* Loop for n >= 8. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + /* Loop for m >= 16. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); + vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); + vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); + vec_t rowA2_l = MERGE_LOW (rowA[1], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); + CO += 16; + + AO += (k << 4); + BO += (k << 3); + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + CO += 8; + AO += (k << 3); + BO += (k << 3); + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowA_mrg = MERGE_ROW (rowA[0]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (k << 2); + BO += (k << 3); + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 2)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (k << 1); + BO += (k << 3); + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 } + , t1 = + { + 0, 0, 0, 0}; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), + BF16TOF32 (BO[(l << 3) + 2]), + BF16TOF32 (BO[(l << 3) + 3]) + }; + v4sf_t rowB1 = + { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), + BF16TOF32 (BO[(l << 3) + 6]), + BF16TOF32 (BO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; + CO += 1; + AO += k; + BO += (k << 3); + } + B += k << 3; + } + N = (n & 7) >> 2; + /* Loop for n >= 4. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; + i = m >> 5; + /* Loop for m >= 32. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + IFLOAT *A1 = AO + (16 * k); + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += k << 4; + BO += k << 2; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 2; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 2; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += k << 1; + BO += k << 2; + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), + BF16TOF32 (BO[(l << 2) + 2]), + BF16TOF32 (BO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + AO += k; + BO += (k << 2); + CO += 1; + } + + B += k << 2; + } + N = (n & 3) >> 1; + /* Loop for n >= 2. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 5; + /* Loop for m >= 32. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + IFLOAT *A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowA1 = (vec_t *) & (A1[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += k << 4; + BO += k << 1; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 1; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 1; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (k << 1); l += 2) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l + 1]), + BF16TOF32 (AO[l + 1]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l + 1]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l + 1]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; + CO += 2; + AO += k << 1; + BO += k << 1; + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), 0, 0 }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 1]), BF16TOF32 (BO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + CO[0 * ldc] += t[0] * alpha; + CO[1 * ldc] += t[1] * alpha; + CO += 1; + AO += k; + BO += k << 1; + } + B += k << 1; + } + N = (n & 1) >> 0; + /* Loop for n = 1. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + /* Loop for m >= 16. */ + while (i >= 16) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), + BF16TOF32 (AO[(l << 4) + 2]), + BF16TOF32 (AO[(l << 4) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), + BF16TOF32 (AO[(l << 4) + 6]), + BF16TOF32 (AO[(l << 4) + 7]) + }; + v4sf_t rowA2 = + { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), + BF16TOF32 (AO[(l << 4) + 10]), + BF16TOF32 (AO[(l << 4) + 11]) + }; + v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), + BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), + BF16TOF32 (AO[(l << 4) + 15]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; + AO += k << 4; + BO += k; + CO += 16; + i -= 16; + } + /* Loop for m >= 8. */ + while (i >= 8) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), + BF16TOF32 (AO[(l << 3) + 2]), + BF16TOF32 (AO[(l << 3) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), + BF16TOF32 (AO[(l << 3) + 6]), + BF16TOF32 (AO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + AO += k << 3; + BO += k; + CO += 8; + i -= 8; + } + /* Loop for m >= 4. */ + while (i >= 4) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), + BF16TOF32 (AO[(l << 2) + 2]), + BF16TOF32 (AO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + AO += k << 2; + BO += k; + CO += 4; + i -= 4; + } + /* Loop for m >= 2. 
*/ + while (i >= 2) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), 0, 0 }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 1]), BF16TOF32 (AO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + AO += k << 1; + BO += k; + CO += 2; + i -= 2; + } + /* Loop for m = 1. */ + while (i >= 1) + { + IFLOAT *BO = B; + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < k; l++) + { + t += BF16TOF32 (AO[l]) * BF16TOF32 (BO[l]); + } + AO += k; + BO += k; + CO[0] += t * alpha; + CO += 1; + i -= 1; + } + + B += k; + } + + return 0; +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 6af813c16..a53342f61 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "srot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 4f3ba5698..de37e10a5 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_16) diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 23d13280f..44522f0a0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index f61c62e75..305e50ede 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index f0f8c6910..3064d5435 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zaxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_4 diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index b21d6ef15..453f4e551 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index fd36c7f44..fe0e9284e 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) - { + { /* __real__ result = 0.0 ; __imag__ result = 0.0 ; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) + /* __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else + /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 000000000..fca389e69 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + 
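+/* Epilogue mirrors the prologue: f14-f31, r14-r31 and vs20-vs31 are
+   reloaded from the same stack slots they were saved to, the link
+   register is restored from FLINK_SAVE, and the frame is popped
+   before blr returns. */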
EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 000000000..1143733e0 --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: 
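+/* Each KERNEL2x1_L2 consumes two k iterations (32 bytes of A, 64 bytes
+   of B), so the 16 calls in this loop body retire 32 iterations per
+   bdnz trip of the CTR loop primed in ZGEMM_2x1_LMAIN_SUB. */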
+/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 / 128 */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + +
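+/* Remainder cascade: L holds the leftover k count after the 128-wide
+   main loop; the SUB2_n blocks test bits of L from 64 down to 1 and
+   fall through, e.g. L = 37 runs the 32-, 4- and 1-step blocks.
+   Hypothetical C sketch of the dispatch:
+     if (L & 64) kernel2x8_steps(64);
+     if (L & 32) kernel2x8_steps(32);
+     ...
+     if (L & 1)  load_end_2x8();
+*/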
+ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif
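+/* Main/remainder split used by every size class (hypothetical C sketch
+   of the addi./srawi. pair above and the later "andi. L, T1, 31"):
+     T1 = K - 2;        // two k steps live in the preload + final END
+     T8 = T1 >> 5;      // trips through the 32-step unrolled loop
+     L  = T1 & 31;      // leftover k steps, peeled by the SUB2 blocks
+*/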
+ ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x1_32K + addi BO, BO, -32 + addi AO,AO, -16 + LOAD2x1O 16, 32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -32 + LOAD2x1_2O 32, 64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_E2 32, 64, 7, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_E2 32, 64, 3, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_E2 32, 64, 1, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32, 64, 0, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 0 + KERNEL1x8_2 32, 0 + KERNEL1x8_2 33, 0 + KERNEL1x8_2 34, 0 + KERNEL1x8_2 35, 0 + KERNEL1x8_2 36, 0 + KERNEL1x8_2 37, 0 + KERNEL1x8_2 38, 0 + KERNEL1x8_2 39, 0 + KERNEL1x8_2 40, 0 + KERNEL1x8_2 41, 0 + KERNEL1x8_2 42, 0 + KERNEL1x8_2 43, 0 + KERNEL1x8_2 44, 0 + KERNEL1x8_2 45, 0 + KERNEL1x8_2 46, 0 + KERNEL1x8_2 47, 0 + KERNEL1x8_2 48, 0 + KERNEL1x8_2 49, 0 + KERNEL1x8_2 50, 0 + KERNEL1x8_2 51, 0 + KERNEL1x8_2 52, 0 + KERNEL1x8_2 53, 0 + KERNEL1x8_2 54, 0 + KERNEL1x8_2 55, 0 + KERNEL1x8_2 56, 0 + KERNEL1x8_2 57, 0 + KERNEL1x8_2 58, 0 + KERNEL1x8_2 59, 0 + KERNEL1x8_2 60, 0 + KERNEL1x8_2 61, 0 + KERNEL1x8_2 62, 0 + KERNEL1x8_2 63, 1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + KERNEL1x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_2 0, 0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 0 + KERNEL1x4_2 8, 0 + KERNEL1x4_2 9, 0 + KERNEL1x4_2 10, 0 + KERNEL1x4_2 11, 0 + KERNEL1x4_2 12, 0 + KERNEL1x4_2 13, 0 + KERNEL1x4_2 14, 0 + KERNEL1x4_2 15, 1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + KERNEL1x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_2 0, 0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 0 + KERNEL1x2_2 8, 0 + KERNEL1x2_2 9, 0 + KERNEL1x2_2 10, 0 + KERNEL1x2_2 11, 0 + KERNEL1x2_2 12, 0 + KERNEL1x2_2 13, 0 + KERNEL1x2_2 14, 0 + KERNEL1x2_2 15, 1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + KERNEL1x2_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 0, 0 + + 
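+/* M=1 columns accumulate in ordinary VSX registers (xvmaddadp), as in
+   the 2x1 macros, rather than MMA accumulators; each KERNEL1x1_L2 call
+   advances A and B by 32 bytes and retires two k iterations. */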
+ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_L2 32, 32, 7, 0 + KERNEL1x1_L2 32, 32, 8, 0 + KERNEL1x1_L2 32, 32, 9, 0 + KERNEL1x1_L2 32, 32, 10, 0 + KERNEL1x1_L2 32, 32, 11, 0 + KERNEL1x1_L2 32, 32, 12, 0 + KERNEL1x1_L2 32, 32, 13, 0 + KERNEL1x1_L2 32, 32, 14, 0 + KERNEL1x1_L2 32, 32, 15, 1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* (T6-2) / 128 */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* (K-2) / 128 */ +#endif + KERNEL1x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP1x8_128K + LOAD_END_1x8 -128, -16 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -256 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 1 + MY_ALIGN + +
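+/* The dcbt hints in these tails prefetch the A stream at 1024, 1536,
+   2048 and 2560 byte offsets (T2-T5, loaded in ZGEMM_L1x8_BEGIN) and
+   the B stream at T2-T4, keeping both panels ahead of the MMA units. */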
+ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + KERNEL1x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + LOAD_END_1x8 128, 16 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL1x8_UNPRIME_MMA + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL1x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x4_32K + LOAD_END_1x4 -64, -16 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -128 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 1 + MY_ALIGN + +
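+/* Each accumulator primed by xxsetaccz and drained by xxmfacc is a
+   512-bit MMA tile overlaying four VSRs; every xvf64gerpp call folds a
+   rank-1 outer product into it, and the SAVE macros then recombine the
+   real/imaginary halves with alpha before storing to C. */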
+ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + KERNEL1x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + LOAD_END_1x4 64,16 + + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + KERNEL1x4_UNPRIME_MMA + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + KERNEL1x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x2_32K + LOAD_END_1x2 -32, -16 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -64 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + KERNEL1x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + LOAD_END_1x2 32,16 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + KERNEL1x2_UNPRIME_MMA + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* (T6-2) / 32 */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* (K-2) / 32 */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. 
L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x1_32K + addi BO, BO, -16 + addi AO,AO, -16 + LOAD1x1O 16, 16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -32 + LOAD1x1_2O 32, 32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1, L, 16 + ble ZGEMM_L1x1_SUB2_8 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_E2 32, 32, 7, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1, L, 8 + ble ZGEMM_L1x1_SUB2_4 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_E2 32, 32, 3, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_E2 32, 32, 1, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32, 32, 0, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S new file mode 100644 index 000000000..42f9c5ad4 --- /dev/null +++ b/kernel/power/zgemm_macros_power10.S @@ -0,0 +1,1138 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha imag instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro 
STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs38,vs40,vs48,vs49 + MULT_APLHA_PART2 vs34,vs36,vs46,vs47 + AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + MULT_APLHA_PART1 vs42,vs44, vs56,vs57 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + MULT_APLHA_PART2 vs42,vs44,vs56,vs57 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59 + UNPACK_FOR_STORE vs56,vs57,vs42,vs44 + UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART1 vs38,vs40, vs48,vs49 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 +#ifndef TRMMKERNEL + lxv vs50, (\LOFFSET)(\BASE_REG) + xxmrgld vs46,vs50,vs50 + xxmrghd vs47,vs50,vs50 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 + 
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + xxmrghd vs39,vs47,vs46 + stxv vs39, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro KERNEL2x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + + +.macro KERNEL2x8_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x8_2 Index, IsLast + lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs44, vs51 + xvf64gerpp 3, vs46, vs51 + xvf64gerpp 4, vs40, vs50 + xvf64gerpp 5, vs42, vs50 + xvf64gerpp 6, vs44, vs50 + xvf64gerpp 7, vs46, vs50 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x8 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 + xxmfacc 4 + xxmfacc 5 + xxmfacc 6 + xxmfacc 7 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + xxpermdi 
vs32, vs16, vs17, 0b01 + xxpermdi vs33, vs16, vs17, 0b10 + xxpermdi vs34, vs18, vs19, 0b01 + xxpermdi vs35, vs18, vs19, 0b10 + xxpermdi vs36, vs20, vs21, 0b01 + xxpermdi vs37, vs20, vs21, 0b10 + xxpermdi vs38, vs22, vs23, 0b01 + xxpermdi vs39, vs22, vs23, 0b10 + xxpermdi vs40, vs24, vs25, 0b01 + xxpermdi vs41, vs24, vs25, 0b10 + xxpermdi vs42, vs26, vs27, 0b01 + xxpermdi vs43, vs26, vs27, 0b10 + xxpermdi vs44, vs28, vs29, 0b01 + xxpermdi vs45, vs28, vs29, 0b10 + xxpermdi vs46, vs30, vs31, 0b01 + xxpermdi vs47, vs30, vs31, 0b10 + + xxlor vs18, vs32, vs32 + xxlor vs19, vs33, vs33 + xxlor vs16, vs34, vs34 + xxlor vs17, vs35, vs35 + xxlor vs22, vs36, vs36 + xxlor vs23, vs37, vs37 + xxlor vs20, vs38, vs38 + xxlor vs21, vs39, vs39 + xxlor vs26, vs40, vs40 + xxlor vs27, vs41, vs41 + xxlor vs24, vs42, vs42 + xxlor vs25, vs43, vs43 + xxlor vs30, vs44, vs44 + xxlor vs31, vs45, vs45 + xxlor vs28, vs46, vs46 + xxlor vs29, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 + addi CO, CO, 128 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + +.macro KERNEL2x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL2x4_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x4_2 Index, IsLast + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x4 OffsetA, OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, 
+ +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro KERNEL2x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL2x2_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x2_2 Index, IsLast + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x2 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + SAVE2 vs4,vs5,vs6,vs7,T1,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + lxv vs52, (\OffsetB+32)(BO) // load real,imag from B + lxv vs54, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xxswapd vs55, vs54 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 + xxswapd vs51, vs50 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs2, vs40, vs54 + xvmaddadp vs1, vs40, vs53 + xvmaddadp vs3, vs40, vs55 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs0,vs1,CO,0 + SAVE1 vs2,vs3,T1,0 + addi CO, CO, 16 +.endm
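[Editor's note] The M=1 column path just above drops back to plain VSX: each B value is kept twice, once as loaded (vs48/vs50) and once doubleword-swapped (xxswapd into vs49/vs51), and xvmaddadp accumulates A against both copies. Two FMAs per column produce the four partial products of a complex multiply; the SAVE1/AGGREGATE helpers later combine them into real and imaginary parts with the signs demanded by the conjugation mode. In scalar form (names illustrative):

/* Swap-and-FMA bookkeeping for one k-step of acc += a*b, complex double.
   acc0 tracks a*(b_re,b_im), i.e. vs0; acc1 tracks a*(b_im,b_re), i.e.
   vs1, fed by the xxswapd copy. */
typedef struct { double re, im; } cplx;

static void cfma_step(cplx a, cplx b, double acc0[2], double acc1[2])
{
    acc0[0] += a.re * b.re;  acc0[1] += a.im * b.im;   /* xvmaddadp vs0 */
    acc1[0] += a.re * b.im;  acc1[1] += a.im * b.re;   /* xvmaddadp vs1 */
    /* final combine, done in the SAVE path: re = acc0[0] - acc0[1],
       im = acc1[0] + acc1[1], modulo conjugation sign flips */
}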
+ +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro KERNEL1x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL1x8_2 Index,IsLast + lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A + lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x8 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE1x8 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + addi CO, CO, 128 +.endm
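[Editor's note] KERNEL1x8_2 never advances AO or BO inside the unrolled body; every lxvp addresses through DISPn(\Index, disp), and a single pair of addi instructions under IsLast moves both pointers past the whole strip. The DISPn definitions sit earlier in this file; reading them as Index times n 16-byte complex values plus a literal byte displacement reproduces the offsets used above. This is an assumption from the usage, not a quote of the actual #defines:

/* Presumed C model of the DISPn helpers and the IsLast pointer bump. */
#define DISP16(ind, disp) ((ind) * 16 * 16 + (disp))  /* A side: 8 complex = 128B per half-step */
#define DISP2(ind, disp)  ((ind) * 2 * 16 + (disp))   /* B side: 1 complex per step */

void k_strip(const char **ao, const char **bo, long chunks)
{
    for (long idx = 0; idx < chunks; idx++) {
        const char *a = *ao + DISP16(idx, 0);  /* lxvp vs32, DISP16(Index,0)(AO) */
        const char *b = *bo + DISP2(idx, 0);   /* lxvp vs48, DISP2(Index,0)(BO)  */
        (void)a; (void)b;                      /* ...xvf64gerpp accumulation...  */
    }
    *ao += DISP16(chunks, 0);                  /* addi AO, AO, DISP16(Index,256) */
    *bo += DISP2(chunks, 0);                   /* addi BO, BO, DISP2(Index,32)   */
}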
+ +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro KERNEL1x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL1x4_2 Index,IsLast + lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x4 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE1x4 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + addi CO, CO, 64 +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro KERNEL1x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 +.endm + + +.macro KERNEL1x2_2 Index,IsLast + lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x2 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 +.endm + + +.macro SAVE1x2 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + addi CO, CO, 32 +.endm
+ +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro ZERO1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs49, vs48 + +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs52, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs1, vs40, vs53 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs0,vs1,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH MACROS*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS
PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm + diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index ba019d6a5..1f7199c89 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif +#endif // #define NBMAX 4096 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index b34199af6..4ed27d96b 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
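[Editor's note] The three REFRESH_* macros above implement the TRMM panel-pointer bookkeeping whose C form is carried along in the // comments; SHIFT_REG converts a value count into a byte offset by shifting left by 4 plus log2(SHIFT_VAL), i.e. multiplying by SHIFT_VAL 16-byte complex values. Collected into one C sketch (it follows the macro comments, not independent logic; c_a/c_b are the per-k element counts, e.g. 16 and 2 in the comments, and BLASLONG comes from common.h):

void refresh_pointers(double **ptrba, double **ptrbb, double *bb,
                      BLASLONG off, int c_a, int c_b)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    *ptrbb = bb;                     /* mr \PTR_B, \B_VAL */
#else
    *ptrba += off * c_a;             /* SHIFT_REG + add on the A side */
    *ptrbb  = bb + off * c_b;        /* SHIFT_REG + add on the B side */
#endif
}

BLASLONG refresh_temp_bk(BLASLONG bk, BLASLONG off, int c_a, int c_b)
{
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    return bk - off;                 /* full remaining depth */
#elif defined(LEFT)
    return off + c_a;                /* number of values in A */
#else
    return off + c_b;                /* number of values in B */
#endif
}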
#include "common.h" #define NBMAX 4096 +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 +#endif #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index c6d666178..5e7ca3b23 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zrot.c" +#else #include "common.h" @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - \ No newline at end of file +#endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index a1b441d2c..5526f4d67 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,11 +38,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 1d8826f41..3a5a8eb83 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b7cf0f112..582a1dc01 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,6 +53,7 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, +#ifdef BUILD_HALF 0, 0, 0, SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, #ifdef SHGEMM_DEFAULT_UNROLL_MN @@ -109,7 +110,7 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif - +#endif 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, @@ -134,6 +135,11 @@ gotoblas_t TABLE_NAME = { sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, +#ifdef ARCH_X86_64 + sgemm_directTS, + sgemm_direct_performantTS, +#endif + sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, @@ -706,19 +712,25 @@ gotoblas_t TABLE_NAME = { #if defined(ARCH_ARM64) static void init_parameter(void) { +#if defined(BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; @@ -782,20 +794,26 @@ static 
void init_parameter(void) { #if defined(ARCH_POWER) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -805,20 +823,26 @@ static void init_parameter(void) { #if defined(ARCH_ZARCH) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -958,9 +982,11 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -1145,7 +1171,7 @@ static void init_parameter(void) { #endif #endif -#ifdef SKYLAKEX +#if defined (SKYLAKEX) || defined (COOPERLAKE) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 34653d400..fde9eba8e 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 492f34344..fddf7560f 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 6840c54ad..33afd2a61 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index e2f731fca..b05bd6ee5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 11825429e..f960559a6 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 4c054f399..cf842c9b5 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e67496736..63c44c27a 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 498057697..4cb01e50a 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index f3072983d..09d5d8e43 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 879ae9c38..7d129e54c 100644 --- 
a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index 6c308197b..d33599317 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE new file mode 100644 index 000000000..0b2f3c0ed --- /dev/null +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.SKYLAKEX diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 65f031d03..9b8b84c30 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,7 +14,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 586d05ac2..c19b98f02 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 93fca0a0d..f2bf19dcd 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" @@ -141,8 +141,8 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i=0; ix=0; iy=0; - inc_x <<= 1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index d81766cd4..0ed02b8d8 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 6bdea6787..c2903b11f 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" @@ -233,9 +233,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda *= 2; lda4 = lda << 2; xbuffer = buffer; diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 72af99809..6d75358a6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index cde5bdaa6..d84c0c221 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 969357614..e4b6622e6 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 416ace59b..9f2bf24e2 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -54,40 +54,40 @@ #define kernel_kstart_n10(mdim,updk) "" #define kernel_kstart_n12(mdim,updk) "" #define kernel_kend_n4(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) #define kernel_kend_n6(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) #define kernel_kend_n8(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) #define kernel_kend_n10(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\ + loada_kend_k1m##mdim 
acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) #define kernel_kend_n12(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144) #endif #else #define HEAD_SET_OFF(ndim) {} @@ -129,18 +129,28 @@ #define init_update_k(mdim) "" #define save_update_k(mdim) "" #endif - + #define KERNEL_h_k1m16n1 \ "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" #define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" -#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ +#ifdef BROADCAST_KERNEL + #define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" -#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" -#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; 
vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ - "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" -#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) + "vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) +#else + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ + "vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\ + "vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) + #define KERNEL_h_k1m16n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ + unit_acc_m16n2(8,9,10,11,%1) +#endif +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" #define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" @@ -151,24 +161,42 @@ #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #ifdef BROADCAST_KERNEL #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" - #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) - #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) - #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) - #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) - #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) + #else + #define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;" + #endif + #define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15) + #define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2) #endif #define save_init_m16 "movq %2,%3; addq $128,%2;" #ifdef TRMMKERNEL #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd 
%%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #else #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #endif #define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) #define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) @@ -206,11 +234,11 @@ #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" - #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) - #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) #endif #define save_init_m8 "movq %2,%3; addq $64,%2;" #ifdef TRMMKERNEL @@ -258,11 +286,11 @@ #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" - #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define 
acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m4 "movq %2,%3; addq $32,%2;" #ifdef TRMMKERNEL @@ -311,11 +339,11 @@ #define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" - #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m2 "movq %2,%3; addq $16,%2;" #ifdef TRMMKERNEL @@ -362,11 +390,11 @@ #define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" - #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" - #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" - #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" - #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" - #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" + #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" #endif #define save_init_m1 "movq %2,%3; addq $8,%2;" #ifdef TRMMKERNEL diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c new file mode 100644 index 000000000..a1da60f8f --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c @@ -0,0 +1,129 @@ +#include +#include "common.h" +#include + +int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){ + double *src1, *src2, *src3, *src4, *dst1; + __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4; + BLASLONG dim1_count, dim2_count, src_inc; + src_inc = 4 * lead_dim - dim_first; + src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim; + for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + z5 = 
_mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16; + z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); + _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6); + _mm512_storeu_pd(dst1+48,z7); _mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + z3 = _mm512_loadu_pd(src3); src3 += 8; + z4 = _mm512_loadu_pd(src4); src4 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + y3 = _mm256_loadu_pd(src3); src3 += 4; + y4 = _mm256_loadu_pd(src4); src4 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); + _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + x3 = _mm_loadu_pd(src3); src3 += 2; + x4 = _mm_loadu_pd(src4); src4 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); + _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++; + dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4; + } + src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc; + } + src_inc -= 2 * lead_dim; + for(; dim2_count>1; dim2_count-=2){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; + dst1[0] = s1; dst1[1] = s2; + } + src1 += src_inc; src2 += src_inc; + } + src_inc -= lead_dim; + for(; dim2_count>0; dim2_count--){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second; 
+ } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; + dst1[0] = s1; + } + src1 += src_inc; + } +} diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d33641e9..da68db0cd 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ed672a757..a3bf28dc8 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index e2436f789..d1270d20b 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index a722cc9df..573377ee0 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index bdcd914fb..f0df5aaa8 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -36,7 +36,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
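[Editor's note] The new dgemm_tcopy_16_skylakex.c above (wired in through DGEMMITCOPY in KERNEL.SKYLAKEX) repacks the transposed A panel in strips: full 16-column strips are copied with pairs of 512-bit loads, four source rows at a time, and the 8/4/2/1-wide remainders fall through with correspondingly narrower AVX-512/AVX/SSE/scalar moves. The destination layout it produces, strip by strip with each strip holding every row's w consecutive values, matches this scalar model (my reading of the intrinsics, not code from the patch):

/* rows = dim_second, cols = dim_first, ld = lead_dim in the SIMD code. */
void tcopy16_ref(long rows, long cols, const double *src, long ld, double *dst)
{
    long col = 0;
    for (long w = 16; w >= 1; w /= 2) {
        while (cols - col >= w) {            /* one w-wide strip */
            for (long r = 0; r < rows; r++)
                for (long j = 0; j < w; j++)
                    *dst++ = src[r * ld + col + j];
            col += w;
        }
    }
}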
#if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize off #endif - +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize off +#endif static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { @@ -164,6 +166,9 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL #if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize on #endif +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize on +#endif #else #include "dsymv_L_microk_haswell-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 431e4bb3f..530ac8b1d 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index e1349da58..7b2845636 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 3536afc9e..e816c67e9 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_direct_performant.c b/kernel/x86_64/sgemm_direct_performant.c new file mode 100644 index 000000000..5a20ce395 --- /dev/null +++ b/kernel/x86_64/sgemm_direct_performant.c @@ -0,0 +1,30 @@ +#include "common.h" +/* helper for the direct sgemm code written by Arjan van der Ven */ + + + + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) +{ + unsigned long long mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index 0e8f1318f..a7cddbb3d 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,7 @@ - +#if defined(SKYLAKEX) || defined (COOPERLAKE) /* the direct sgemm code written by Arjan van der Ven */ -//#include - +#include +#include "common.h" /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, @@ -38,6 +38,7 @@ #define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; #define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; +#if 0 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { unsigned long long mnk = M * N * K; @@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) return 1; } +#endif - -void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) { int i, j, k; @@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict } } } +#else +#include "common.h" +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{} +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index 3b1af33c1..f3d614242 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -512,4 +512,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } #include -#include "sgemm_direct_skylakex.c" +//#include "sgemm_direct_skylakex.c" diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c index 5ab3e6d1f..a2e78c58d 100644 --- 
a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -1,4 +1,4 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ /* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 63697970f..3eec21774 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 86ecaf516..fe886f57f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 73ae001ea..c9d698eb7 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index f37c251a1..4d8aac1ab 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c index 4131debb1..5410bd4ae 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" #include #include "strsm_kernel_8x4_haswell_L_common.h" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h index cfa56da97..2862a5b8d 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -1,4 +1,4 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8a5c44c9b..fea4fc746 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 0c40a3435..b853ef365 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 7a2eeace5..bad367e91 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 0408b577c..147201751 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 
(16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 53866cf95..25e9f6d42 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 01169e8e6..1bc785ac1 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" @@ -140,8 +140,8 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO i=0; ix=0; iy=0; - inc_x <<= 1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { @@ -168,7 +168,7 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO #if defined(SMP) static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, -BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG dummy1, FLOAT dummy2r, FLOAT dummy2i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { zdot_compute(n, x, inc_x, y, inc_y, (void *)result); diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 0fedc496b..1f9d41859 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 2ab7a671b..34f28b224 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_t_microk_haswell-4.c" #endif @@ -235,9 +235,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda <<= 1; lda4 = lda << 2; xbuffer = buffer; diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 2a6d0e4c7..09a702a81 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index e44bd7550..83ed41ba1 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index e9f330c36..7ed2faf0f 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 9f0dead18..5945f3f81 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index b6106a37d..484d74f14 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index f6e3bec23..3510938a7 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -86,24 +86,24 @@ DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = cgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c -STRMMKERNEL = strmm8x4V.S -DTRMMKERNEL = trmm8x4V.S +STRMMKERNEL = gemm_vec.c +DTRMMKERNEL = gemm_vec.c CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S -SGEMMKERNEL = strmm8x4V.S -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = gemm_vec.c +ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) 
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - - -DGEMMKERNEL = gemm8x4V.S +DGEMMKERNEL = gemm_vec.c DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c @@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c new file mode 100644 index 000000000..741c09431 --- /dev/null +++ b/kernel/zarch/gemm_vec.c @@ -0,0 +1,710 @@ +/* + * Copyright (c) IBM Corporation 2020. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "common.h" +#include <vecintrin.h> + +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +#ifdef COMPLEX +#error "Handling for complex numbers is not supported in this kernel" +#endif + +#ifdef DOUBLE +#define UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif + +static const size_t unroll_m = UNROLL_M; +static const size_t unroll_n = UNROLL_N; + +/* Handling of triangular matrices */ +#ifdef TRMMKERNEL +static const bool trmm = true; +static const bool left = +#ifdef LEFT + true; +#else + false; +#endif + +static const bool backwards = +#if defined(LEFT) != defined(TRANSA) + true; +#else + false; +#endif + +#else +static const bool trmm = false; +static const bool left = false; +static const bool backwards = false; +#endif /* TRMMKERNEL */ + +/* + * Background: + * + * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication + * problem by splitting all matrices into partitions multiple times, so that the + * submatrices fit into the L1 or L2 caches. As a result, each multiplication of + * submatrices can stream data fast from L1 and L2 caches. In between, it copies + * and rearranges the submatrices to enable contiguous memory accesses to + * improve locality in both caches and TLBs.
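+ * + * As a compact sketch (illustrative only; the actual block sizes come from + * UNROLL_M/UNROLL_N and the cache-size parameters), the resulting loop + * structure amounts to: + * + * for each panel B_j of n_r columns of B: // sized to fit the L1 cache + * for each block A_i of m_r rows of A: // streamed from the L2 cache + * C_ij += alpha * A_i * B_j // m_r x n_r tile held in registers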
+ * + * At the heart of the algorithm is this kernel, which multiplies a "Block + * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is + * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this + * operation GEBP. This kernel further partitions GEBP twice, such that (1) + * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a + * block of C fits into the registers, while multiplying panels from A and B + * streamed from the L2 and L1 cache, respectively (GEBP_block). + * + * + * Algorithm GEBP(A, B, C, m, n, k, alpha): + * + * The problem is calculating C += alpha * (A * B) + * C is an m x n matrix, A is an m x k matrix, B is a k x n matrix. + * + * - C is in column-major-order, with an offset of ldc to the element in the + * next column (same row). + * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column + * contiguously while walking along rows. + * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row + * contiguously. + * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or + * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2 + * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a + * block-of-1). + * + * Note that packing A and B into that form is taken care of by the caller in + * driver/level3/level3.c (actually done by "copy kernels"). + * + * Steps: + * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j. + * Now, B_j should fit into the L1 cache. + * - For each partition, calculate C_j += alpha * (A * B_j) by + * (1) Calculate C_aux := A * B_j (see below) + * (2) unpack C_j = C_j + alpha * C_aux + * + * + * Algorithm for Calculating C_aux: + * + * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows, + * such that the m_r x n_r-submatrix of C_aux can be held in registers. Each + * submatrix of C_aux can be calculated independently, and the registers are + * added back into C_j. + * + * - For each row-block of C_aux: + * (uses a row block of A and full B_j) + * - stream over all columns of A, multiply with elements from B and + * accumulate in registers. (use different inner-kernels to exploit + * vectorization for varying block sizes) + * - add alpha * row block of C_aux back into C_j. + * + * Note that there are additional mechanics for handling triangular matrices, + * calculating B := alpha (A * B) where either of the matrices A or B can be + * triangular. In case of A, the macro "LEFT" is defined. In addition, A can + * optionally be transposed. + * The code effectively skips an "offset" number of columns in A and rows of B + * in each block, to save unnecessary work by exploiting the triangular nature. + * To handle all cases, the code discerns (1) a "left" mode when A is triangular + * and (2) "forward" / "backwards" modes where only the first "offset" + * columns/rows of A/B are used or where the first "offset" columns/rows are + * skipped, respectively. + * + * Reference: + * + * The summary above is based on staring at various kernel implementations and: + * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix + * Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No. + * 3, May 2008. + */ + +#define VLEN_BYTES 16 +#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) + +typedef FLOAT vector_float __attribute__ ((vector_size (16))); + +/** + * Load a vector into a register, and hint on 8-byte alignment to improve + * performance. gcc-9 and newer will create these hints by themselves. For older + * compiler versions, use inline assembly to explicitly express the hint. + * Provide explicit hex encoding to cater for binutils versions that do not know + * about vector-load with alignment hints yet. + * + * Note that, for block sizes where we apply vectorization, vectors in A will + * always be 8-byte aligned. + */ +static inline vector_float vec_load_hinted(FLOAT const *restrict a) { + vector_float const *restrict addr = (vector_float const *restrict)a; + vector_float y; + +#if __GNUC__ < 9 + // hex-encode vl %[out],%[addr],3 + asm(".insn vrx,0xe70000003006,%[out],%[addr],3" + : [ out ] "=v"(y) + : [ addr ] "R"(*addr)); +#else + y = *addr; +#endif + + return y; +} + +/** + * Calculate a row-block in C_i of size ROWSxCOLS using vector intrinsics. + * + * @param[in] A Pointer to the current block of input matrix A. + * @param[in] bk Number of columns in A. + * @param[in] B Pointer to the current block of input matrix B. + * @param[inout] C Pointer to the current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +#define VECTOR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + _Static_assert( \ + ROWS % VLEN_FLOATS == 0, \ + "rows in block must be multiples of vector length"); \ + vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float A0 = \ + vec_load_hinted(A + i * VLEN_FLOATS); \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] = A0 * B[j]; \ + } \ + \ + /* \ + * Stream over the row-block of A, which is packed \ + * column-by-column, multiply by coefficients in B and add up \ + * into temporaries Caux (which the compiler will hold in \ + * registers). Vectorization: Multiply column vectors from A \ + * with scalars from B and add up in column vectors of Caux. \ + * That equates to unrolling the loop over rows (in i) and \ + * executing each unrolled iteration as a vector element. \ + */ \ + for (BLASLONG k = 1; k < bk; k++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float Ak = vec_load_hinted( \ + A + i * VLEN_FLOATS + k * ROWS); \ + \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += Ak * B[j + k * COLS]; \ + } \ + } \ + \ + /* \ + * Unpack row-block of C_aux into outer C_i, multiply by \ + * alpha and add up. \ + */ \ + for (BLASLONG j = 0; j < COLS; j++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float *C_ij = \ + (vector_float *)(C + i * VLEN_FLOATS + \ + j * ldc); \ + if (trmm) { \ + *C_ij = alpha * Caux[i][j]; \ + } else { \ + *C_ij += alpha * Caux[i][j]; \ + } \ + } \ + } \ + } + + +#if UNROLL_M == 16 +VECTOR_BLOCK(16, 2) +VECTOR_BLOCK(16, 1) +#endif +#if UNROLL_N == 8 +VECTOR_BLOCK(8, 8) +VECTOR_BLOCK(4, 8) +#endif +#ifndef DOUBLE +VECTOR_BLOCK(8, 4) +#endif +VECTOR_BLOCK(8, 2) +VECTOR_BLOCK(8, 1) +VECTOR_BLOCK(4, 4) +VECTOR_BLOCK(4, 2) +VECTOR_BLOCK(4, 1) + +/** + * Calculate a row-block in C_i of size ROWSxCOLS using scalar operations. + * A simple implementation for smaller block sizes. + * + * @param[in] A Pointer to the current block of input matrix A. + * @param[in] k Number of columns in A. + * @param[in] B Pointer to the current block of input matrix B. + * @param[inout] C Pointer to the current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C.
+ * @param[in] alpha Scalar factor. + */ +#define SCALAR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \ + \ + /* \ + * Peel off first iteration (i.e., column of A) for \ + * initializing Caux \ + */ \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \ + \ + for (BLASLONG kk = 1; kk < k; kk++) \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + if (trmm) { \ + C[i + j * ldc] = alpha * Caux[i][j]; \ + } else { \ + C[i + j * ldc] += alpha * Caux[i][j]; \ + } \ + } + +#ifdef DOUBLE +VECTOR_BLOCK(2, 4) +VECTOR_BLOCK(2, 2) +VECTOR_BLOCK(2, 1) +#else +SCALAR_BLOCK(2, 4) +SCALAR_BLOCK(2, 2) +SCALAR_BLOCK(2, 1) +#endif + +SCALAR_BLOCK(1, 4) +SCALAR_BLOCK(1, 2) +SCALAR_BLOCK(1, 1) + + +/** + * Calculate a row-block that fits 4x4 vector registers using a loop + * unrolled-by-2 with explicit interleaving to better overlap loads and + * computation. + * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM. + */ +#ifdef DOUBLE +static inline void GEBP_block_8_4( +#else // float +static inline void GEBP_block_16_4( +#endif + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { +#define VEC_ROWS 4 +#define VEC_COLS 4 +#define ROWS VEC_ROWS * VLEN_FLOATS +#define COLS (VEC_COLS) + + /* + * Hold intermediate results in vector registers. + * Since we need to force the compiler's hand in places, we need to use + * individual variables in contrast to the generic implementation's + * arrays. + */ +#define INIT_ROW_OF_C(ROW) \ + vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \ + vector_float C_##ROW##_0 = A##ROW * B[0]; \ + vector_float C_##ROW##_1 = A##ROW * B[1]; \ + vector_float C_##ROW##_2 = A##ROW * B[2]; \ + vector_float C_##ROW##_3 = A##ROW * B[3]; + + INIT_ROW_OF_C(0) + INIT_ROW_OF_C(1) + INIT_ROW_OF_C(2) + INIT_ROW_OF_C(3) +#undef INIT_ROW_OF_C + + if (bk > 1) { + BLASLONG k = 1; + vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS]; + vector_float Bk[VEC_COLS], Bknext[VEC_COLS]; + + /* + * Note that in several places, we enforce an instruction + * sequence that we identified empirically by utilizing dummy + * asm statements. + */ + + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + k * COLS]); + asm(""); + + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS); + + for (; k < (bk - 2); k += 2) { + /* + * Load inputs for (k+1) into registers. + * Loading from B first is advantageous. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bknext[j] = vec_splats(B[j + (k + 1) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS + + (k + 1) * ROWS); + + /* + * To achieve better instruction-level parallelism, + * make sure to first load input data for (k+1) before + * initiating compute for k. We enforce that ordering + * with a pseudo asm statement. + * Note that we need to massage this particular "barrier" + * depending on the gcc version. 
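+ * + * (Concretely, each such barrier below is an empty asm statement that names + * the C accumulators as read/write operands and the freshly loaded A/B + * vectors as plain inputs; it emits no instruction itself, but keeps gcc + * from sinking those loads past the multiply-adds that consume them.)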
+ */ +#if __GNUC__ > 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm("" \ + : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \ + "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + asm("" \ + : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \ + "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + } while (0) +#else // __GNUC__ <= 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm(""); \ + } while (0) +#endif + + BARRIER_READ_BEFORE_COMPUTE(knext); + + /* Compute for (k) */ + C_0_0 += Ak[0] * Bk[0]; + C_1_0 += Ak[1] * Bk[0]; + C_2_0 += Ak[2] * Bk[0]; + C_3_0 += Ak[3] * Bk[0]; + + C_0_1 += Ak[0] * Bk[1]; + C_1_1 += Ak[1] * Bk[1]; + C_2_1 += Ak[2] * Bk[1]; + C_3_1 += Ak[3] * Bk[1]; + + C_0_2 += Ak[0] * Bk[2]; + C_1_2 += Ak[1] * Bk[2]; + C_2_2 += Ak[2] * Bk[2]; + C_3_2 += Ak[3] * Bk[2]; + + C_0_3 += Ak[0] * Bk[3]; + C_1_3 += Ak[1] * Bk[3]; + C_2_3 += Ak[2] * Bk[3]; + C_3_3 += Ak[3] * Bk[3]; + + asm(""); + + /* + * Load inputs for (k+2) into registers. + * First load from B. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + (k + 2) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS); + + /* + * As above, make sure to first schedule the loads for (k+2) + * before compute for (k+1). + */ + BARRIER_READ_BEFORE_COMPUTE(k); + + /* Compute on (k+1) */ + C_0_0 += Aknext[0] * Bknext[0]; + C_1_0 += Aknext[1] * Bknext[0]; + C_2_0 += Aknext[2] * Bknext[0]; + C_3_0 += Aknext[3] * Bknext[0]; + + C_0_1 += Aknext[0] * Bknext[1]; + C_1_1 += Aknext[1] * Bknext[1]; + C_2_1 += Aknext[2] * Bknext[1]; + C_3_1 += Aknext[3] * Bknext[1]; + + C_0_2 += Aknext[0] * Bknext[2]; + C_1_2 += Aknext[1] * Bknext[2]; + C_2_2 += Aknext[2] * Bknext[2]; + C_3_2 += Aknext[3] * Bknext[2]; + + C_0_3 += Aknext[0] * Bknext[3]; + C_1_3 += Aknext[1] * Bknext[3]; + C_2_3 += Aknext[2] * Bknext[3]; + C_3_3 += Aknext[3] * Bknext[3]; + } + + /* Wrapup remaining k's */ + for (; k < bk; k++) { + vector_float Ak; + +#define COMPUTE_WRAPUP_ROW(ROW) \ + Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \ + C_##ROW##_0 += Ak * B[0 + k * COLS]; \ + C_##ROW##_1 += Ak * B[1 + k * COLS]; \ + C_##ROW##_2 += Ak * B[2 + k * COLS]; \ + C_##ROW##_3 += Ak * B[3 + k * COLS]; + + COMPUTE_WRAPUP_ROW(0) + COMPUTE_WRAPUP_ROW(1) + COMPUTE_WRAPUP_ROW(2) + COMPUTE_WRAPUP_ROW(3) +#undef COMPUTE_WRAPUP_ROW + } + } + + /* + * Unpack row-block of C_aux into outer C_i, multiply by + * alpha and add up (or assign for TRMM). 
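+ * + * (The assignment case serves TRMM, which computes B := alpha * (A * B); + * there the destination tile carries no prior contribution, so it is + * overwritten rather than accumulated into.)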
+ */ +#define WRITE_BACK_C(ROW, COL) \ + do { \ + vector_float *Cij = \ + (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \ + if (trmm) { \ + *Cij = alpha * C_##ROW##_##COL; \ + } else { \ + *Cij += alpha * C_##ROW##_##COL; \ + } \ + } while (0) + + WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3); + WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3); + WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3); + WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3); +#undef WRITE_BACK_C + +#undef ROWS +#undef VEC_ROWS +#undef COLS +#undef VEC_COLS +#undef BARRIER_READ_BEFORE_COMPUTE +} + +/** + * Handle calculation for row blocks in C_i of any size by dispatching into + * macro-defined (inline) functions or by deferring to a simple generic + * implementation. Note that the compiler can remove this awkward-looking + * dispatching code while inlining. + * + * @param[in] m Number of rows in block C_i. + * @param[in] n Number of columns in block C_i. + * @param[in] first_row Index of the first row of the block C_i (relative to C). + * @param[in] A Pointer to input matrix A (note: all of it). + * @param[in] k Number of columns in A and rows in B. + * @param[in] B Pointer to the current column block (panel) of input matrix B. + * @param[inout] C Pointer to the current column block (panel) of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @param[in] off Running offset for handling triangular matrices. + */ +static inline void GEBP_block(BLASLONG m, BLASLONG n, + BLASLONG first_row, + const FLOAT * restrict A, BLASLONG k, + const FLOAT * restrict B, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha, + BLASLONG offset, BLASLONG off) +{ + if (trmm && left) + off = offset + first_row; + + A += first_row * k; + C += first_row; + + if (trmm) { + if (backwards) { + A += off * m; + B += off * n; + k -= off; + } else { + if (left) { + k = off + m; + } else { + k = off + n; + } + } + } + + /* Dispatch into the implementation for each block size: */ + +#define BLOCK(bm, bn) \ + if (m == bm && n == bn) { \ + GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ + return; \ + } + +#if UNROLL_M == 16 + BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); +#endif +#if UNROLL_N == 8 + BLOCK(8, 8); BLOCK(4, 8); +#endif + BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); + BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); + + BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1); + + BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1); + +#undef BLOCK +} + +/** + * Handle a column block (panel) of C and B while calculating C += alpha * (A * B). + * + * @param[in] num_cols Number of columns in the block (in C and B). + * @param[in] first_col First column of the current block (in C and B). + * @param[in] A Pointer to input matrix A. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] B Pointer to input matrix B (note: all of it). + * @param[in] bm Number of rows in C and A. + * @param[inout] C Pointer to output matrix C (note: all of it). + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + */ +static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, + const FLOAT *restrict A, BLASLONG bk, + const FLOAT *restrict B, BLASLONG bm, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha, + BLASLONG const offset) { + + FLOAT *restrict C_i = C + first_col * ldc; + /* + * B is in column-order with n_r packed row elements, which does + * not matter -- we always move in full such blocks of + * column*pack + */ + const FLOAT *restrict B_i = B + first_col * bk; + + BLASLONG off = 0; + if (trmm) { + if (left) { + off = offset; + } else { + off = -offset + first_col; + } + } + + /* + * Calculate C_aux := A * B_i + * then unpack C_i += alpha * C_aux. + * + * For that purpose, further partition C_aux and A into blocks + * of m_r (unroll_m) rows, or powers-of-2 if smaller. + */ + BLASLONG row = 0; + for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) + for (; bm - row >= block_size; row += block_size) + GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, + ldc, alpha, offset, off); +} + +/** + * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) + * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and + * C are pointers to submatrices of the actual matrices. + * + * For triangular matrix multiplication, calculate B := alpha (A * B) where A + * or B can be triangular (in case of A, the macro LEFT will be defined). + * + * @param[in] bm Number of rows in C and A. + * @param[in] bn Number of columns in C and B. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] alpha Scalar factor. + * @param[in] ba Pointer to input matrix A. + * @param[in] bb Pointer to input matrix B. + * @param[inout] C Pointer to output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @returns 0 on success. + */ +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, + FLOAT *restrict ba, FLOAT *restrict bb, + FLOAT *restrict C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) + return 0; + + /* + * The interface code allocates buffers for ba and bb at page + * granularity (i.e., using mmap(MAP_ANONYMOUS)), so enable the compiler + * to make use of that fact in vector load operations. + */ + ba = __builtin_assume_aligned(ba, 16); + bb = __builtin_assume_aligned(bb, 16); + + /* + * Use offset and off even when compiled as SGEMMKERNEL to simplify + * function signatures and function calls. + */ +#ifndef TRMMKERNEL + BLASLONG const offset = 0; +#endif + + /* + * Partition B and C into blocks of n_r (unroll_n) columns, called B_i + * and C_i. For each partition, calculate C_i += alpha * (A * B_i). + * + * For remaining columns that do not fill up a block of n_r, iteratively + * use smaller block sizes of powers of 2.
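+ * + * For example, with unroll_n = 8 and bn = 13, the loop below visits one + * block of 8 columns, then one of 4, then one of 1, covering each column + * of C exactly once.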
+ */ + BLASLONG col = 0; + for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) + for (; bn - col >= block_size; col += block_size) + GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); + + return 0; +} diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6eb0b696b..012c104bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -12575,7 +12575,7 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); -int LAPACKE_get_nancheck( ); +int LAPACKE_get_nancheck( void ); #ifdef __cplusplus } diff --git a/lapack-netlib/SRC/clargv.f b/lapack-netlib/SRC/clargv.f index ba53cae6f..36c5108df 100644 --- a/lapack-netlib/SRC/clargv.f +++ b/lapack-netlib/SRC/clargv.f @@ -200,7 +200,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/clartg.f b/lapack-netlib/SRC/clartg.f index da9a1cdef..baa68b657 100644 --- a/lapack-netlib/SRC/clartg.f +++ b/lapack-netlib/SRC/clartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/SRC/dlartg.f b/lapack-netlib/SRC/dlartg.f index 1c7c46f63..dc49986a0 100644 --- a/lapack-netlib/SRC/dlartg.f +++ b/lapack-netlib/SRC/dlartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/dlartgp.f b/lapack-netlib/SRC/dlartgp.f index 0cb0d2d13..334e416e8 100644 --- a/lapack-netlib/SRC/dlartgp.f +++ b/lapack-netlib/SRC/dlartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartg.f b/lapack-netlib/SRC/slartg.f index 784d4bc36..307c9c83a 100644 --- a/lapack-netlib/SRC/slartg.f +++ b/lapack-netlib/SRC/slartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartgp.f b/lapack-netlib/SRC/slartgp.f index ad76c94b4..f8be5f52b 100644 --- a/lapack-netlib/SRC/slartgp.f +++ b/lapack-netlib/SRC/slartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 
20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/zheequb.f b/lapack-netlib/SRC/zheequb.f index d698232e8..7d719f41e 100644 --- a/lapack-netlib/SRC/zheequb.f +++ b/lapack-netlib/SRC/zheequb.f @@ -271,7 +271,7 @@ AVG = AVG / N STD = 0.0D0 - DO I = N+1, N + DO I = N+1, 2*N WORK( I ) = S( I-N ) * WORK( I-N ) - AVG END DO CALL ZLASSQ( N, WORK( N+1 ), 1, SCALE, SUMSQ ) diff --git a/lapack-netlib/SRC/zlargv.f b/lapack-netlib/SRC/zlargv.f index 1e17983d5..f83ca1851 100644 --- a/lapack-netlib/SRC/zlargv.f +++ b/lapack-netlib/SRC/zlargv.f @@ -201,7 +201,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/zlartg.f b/lapack-netlib/SRC/zlartg.f index 8989bb896..894b4ded0 100644 --- a/lapack-netlib/SRC/zlartg.f +++ b/lapack-netlib/SRC/zlartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index d5ca95013..80e6b3232 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -1,3 +1,7 @@ +enable_language(Fortran) + +enable_testing() + if(MSVC_VERSION) # string(REPLACE "/STACK:10000000" "/STACK:900000000000000000" # CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") @@ -168,3 +172,394 @@ if(PYTHONINTERP_FOUND) COMMAND ${PYTHON_EXECUTABLE} "lapack_testing.py" ) endif() + + + +# $1 exec, $2 input, $3 output_result +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh +"rm -f $3\n" +"$1 < $2 > $3\n" +"grep -q FATAL $3\n" +"if [ $? -eq 0 ]; then\n" +"echo Error\n" +"exit 1\n" +"else\n" +"exit 0\n" +"fi\n" +) + + +add_test(NAME "REAL_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" +) +add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" +) +add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" +) +add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" +) + +add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" +) +# ======== COMPLEX-COMPLEX16 LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" +) + +# ======== SINGLE RFP LIN TESTS ======================== + +add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" +) + +# ======== DOUBLE PRECISION RFP LIN TESTS ======================== + +add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" +) +# ======== COMPLEX RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" +) + +# ======== COMPLEX16 RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" +) +# +# +# ======== SINGLE EIG TESTS =========================== +# + +add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" +) + +add_test(NAME
"SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" +) + +add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" +) + +add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" +) + +add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" +) + +add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" +) + +add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" +) + +add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" +) + +add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" +) + +add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" +) + +add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" +) + +add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" +) + +add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" +) + +add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " 
${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" +) + +add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" +) + +add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" +) + +add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" +) + +add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" +) + +add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" +) + +add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" +) + +# ======== COMPLEX EIG TESTS =========================== + +add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" +) + +add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" +) + +add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" +) + +add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" +) + +add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" +) + +add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" +) + +add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" +) + +add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" +) + +add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" +) + +add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" +) + +add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" +) + +add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" +) + +add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" +) + +add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" +) + +add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" +) + +add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" +) + +add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" +) + +add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" +) + +add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" +) + +add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" +) + +# ======== DOUBLE EIG TESTS =========================== + +add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" +) + +add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" +) + +add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" +) + +add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" +) + +add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" +) + +add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" +) + +add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" +) + +add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" +) + +add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" +) + +add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" +) + +add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" +) + +add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" +) + +add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" +) + +add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" +) + +add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" +) + +add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" +) + +add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" +) + +add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" +) + +add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" +) + +add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" +) + +# ======== COMPLEX16 EIG TESTS =========================== + +add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" +) + +add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" +) + +add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" +) + +add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" +) + +add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" +) + +add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" +) + +add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" +) + +add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" +) + +add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" +) + +add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" +) + +add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" +) + +add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" +) + +add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" +) + +add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" +) + +add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" +) + +add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" +) + +add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " 
${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" +) + +add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" +) + +add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" +) + +add_test(NAME "Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" +) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 20fd25b4a..e877b1422 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,7 @@ set(ZEIGTST zchkee.f macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index c941d3577..0d0bb5418 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,8 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) +#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index d24e3885b..f43c10b72 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -372,13 +372,13 @@ END IF * Compute workspace needed for CGELSY CALL CGELSY( M, N, NRHS, A, LDA, B, LDB, - $ IWQ, RCOND, CRANK, WQ, -1, RWORK, + $ IWQ, RCOND, CRANK, WQ, -1, RWQ, $ INFO ) LWORK_CGELSY = INT( WQ( 1 ) ) LRWORK_CGELSY = 2*N * Compute workspace needed for CGELSS CALL CGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_CGELSS = INT( WQ( 1 ) ) LRWORK_CGELSS = 5*MNMIN * Compute workspace needed for CGELSD @@ -564,7 +564,7 @@ CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL CSCAL( NCOLS*NRHS, - $ ONE / REAL( NCOLS ), WORK, + $ CONE / REAL( NCOLS ), WORK, $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, diff --git a/lapack-netlib/TESTING/LIN/derrtsqr.f b/lapack-netlib/TESTING/LIN/derrtsqr.f index c8ad30257..d1d0ff02d 100644 --- a/lapack-netlib/TESTING/LIN/derrtsqr.f +++ b/lapack-netlib/TESTING/LIN/derrtsqr.f @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. DOUBLE PRECISION A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. 
EXTERNAL ALAESM, CHKXER, DGEQR, @@ -137,6 +137,8 @@ * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'DGEMQR' NB=1 INFOT = 1 diff --git a/lapack-netlib/TESTING/LIN/serrtsqr.f b/lapack-netlib/TESTING/LIN/serrtsqr.f index f00f3e14b..7f91a3c39 100644 --- a/lapack-netlib/TESTING/LIN/serrtsqr.f +++ b/lapack-netlib/TESTING/LIN/serrtsqr.f @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. REAL A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, SGEQR, @@ -137,6 +137,8 @@ * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'SGEMQR' NB=1 INFOT = 1 diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 4587c5686..1313c853b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -372,12 +372,12 @@ END IF * Compute workspace needed for ZGELSY CALL ZGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_ZGELSY = INT( WQ( 1 ) ) LRWORK_ZGELSY = 2*N * Compute workspace needed for ZGELSS CALL ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1 , RWORK, + $ RCOND, CRANK, WQ, -1 , RWQ, $ INFO ) LWORK_ZGELSS = INT( WQ( 1 ) ) LRWORK_ZGELSS = 5*MNMIN @@ -564,7 +564,7 @@ CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL ZSCAL( NCOLS*NRHS, - $ ONE / DBLE( NCOLS ), WORK, + $ CONE / DBLE( NCOLS ), WORK, $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c602822a8..fc410b0e7 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -68,7 +68,7 @@ double sqrt(double); #define GETRF_FACTOR 1.00 -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c deleted file mode 100644 index 312509685..000000000 --- a/lapack/getrf/potrf_parallel.c +++ /dev/null @@ -1,667 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include <stdio.h> -#include "common.h" - -#ifndef USE_SIMPLE_THREADED_LEVEL3 - -//The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. -#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD -#define USE_ALLOC_HEAP -#endif - - -static FLOAT dm1 = -1.; - -#ifndef KERNEL_FUNC -#ifndef LOWER -#define KERNEL_FUNC SYRK_KERNEL_U -#else -#define KERNEL_FUNC SYRK_KERNEL_L -#endif -#endif - -#ifndef LOWER -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_LT -#else -#define TRSM_KERNEL TRSM_KERNEL_LC -#endif -#else -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_RN -#else -#define TRSM_KERNEL TRSM_KERNEL_RR -#endif -#endif - -#ifndef CACHE_LINE_SIZE -#define CACHE_LINE_SIZE 8 -#endif - -#ifndef DIVIDE_RATE -#define DIVIDE_RATE 2 -#endif - -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - -#ifndef LOWER -#define TRANS -#endif - -#ifndef SYRK_LOCAL -#if !defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_UN -#elif !defined(LOWER) && defined(TRANS) -#define SYRK_LOCAL SYRK_UT -#elif defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_LN -#else -#define SYRK_LOCAL SYRK_LT -#endif -#endif - -typedef struct { -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; -} job_t; - - -#ifndef KERNEL_OPERATION -#ifndef COMPLEX -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#else -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#endif -#endif - -#ifndef ICOPY_OPERATION -#ifndef TRANS -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef OCOPY_OPERATION -#ifdef TRANS -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef S -#define S args -> a -#endif -#ifndef A -#define A args -> b -#endif -#ifndef C -#define C args -> c -#endif -#ifndef LDA -#define LDA args -> lda -#endif -#ifndef N -#define N args -> m -#endif -#ifndef K -#define K args -> k -#endif - -static
int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - - FLOAT *buffer[DIVIDE_RATE]; - - BLASLONG k, lda; - BLASLONG m_from, m_to; - - FLOAT *alpha; - FLOAT *a, *c; - job_t *job = (job_t *)args -> common; - BLASLONG xxx, bufferside; - - BLASLONG jjs, min_jj; - BLASLONG is, min_i, div_n; - - BLASLONG i, current; - - k = K; - - a = (FLOAT *)A; - c = (FLOAT *)C; - - lda = LDA; - - alpha = (FLOAT *)args -> alpha; - - m_from = range_n[mypos + 0]; - m_to = range_n[mypos + 1]; - -#if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); -#endif - - div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); - for (i = 1; i < DIVIDE_RATE; i++) { - buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; - } - -#ifndef LOWER - TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#else - TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#endif - - for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - - for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ - - min_jj = MIN(m_to, xxx + div_n) - jjs; - -#ifndef LOWER - if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; -#else - if (min_jj > GEMM_P) min_jj = GEMM_P; -#endif - -#ifndef LOWER - OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (k, min_jj, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - sb, - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - a + jjs * lda * COMPSIZE, lda, 0); -#else - ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (min_jj, k, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - sb, - a + jjs * COMPSIZE, lda, 0); -#endif - } - -#ifndef LOWER - for (i = 0; i <= mypos; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#else - for (i = mypos; i < args -> nthreads; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#endif - - WMB; - } - - min_i = m_to - m_from; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - /* thread has to wait */ - if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, m_from, xxx); - - if (m_from + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } - -#ifndef LOWER - current ++; -#else - current --; -#endif - } - - for(is = m_from + min_i; is < m_to; is += min_i){ - min_i = m_to - is; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; 
- } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, is, xxx); - - if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } -#ifndef LOWER - current ++; -#else - current --; -#endif - } - } - - for (i = 0; i < args -> nthreads; i++) { - if (i != mypos) { - for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; - } - } - } - - return 0; - } - -static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ - - blas_arg_t newarg; - -#ifndef USE_ALLOC_HEAP - job_t job[MAX_CPU_NUMBER]; -#else - job_t * job = NULL; -#endif - - blas_queue_t queue[MAX_CPU_NUMBER]; - - BLASLONG range[MAX_CPU_NUMBER + 100]; - - BLASLONG num_cpu; - - BLASLONG nthreads = args -> nthreads; - - BLASLONG width, i, j, k; - BLASLONG n, n_from, n_to; - int mode, mask; - double dnum; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; - mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_REAL; - mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; - mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; - mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif -#endif - - newarg.m = args -> m; - newarg.k = args -> k; - newarg.a = args -> a; - newarg.b = args -> b; - newarg.c = args -> c; - newarg.lda = args -> lda; - newarg.alpha = args -> alpha; - -#ifdef USE_ALLOC_HEAP - job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); - if(job==NULL){ - fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); - exit(1); - } -#endif - - newarg.common = (void *)job; - - n_from = 0; - n_to = args -> m; - -#ifndef LOWER - - range[MAX_CPU_NUMBER] = n_to - n_from; - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - - queue[num_cpu].sa = NULL; - 
queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - - for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; - -#else - - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[num_cpu + 1] = range[num_cpu] + width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - queue[num_cpu].range_n = range; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - -#endif - - newarg.nthreads = num_cpu; - - if (num_cpu) { - - for (j = 0; j < num_cpu; j++) { - for (i = 0; i < num_cpu; i++) { - for (k = 0; k < DIVIDE_RATE; k++) { - job[j].working[i][CACHE_LINE_SIZE * k] = 0; - } - } - } - - queue[0].sa = sa; - queue[0].sb = sb; - queue[num_cpu - 1].next = NULL; - - exec_blas(num_cpu, queue); - } - -#ifdef USE_ALLOC_HEAP - free(job); -#endif - - return 0; -} - -#endif - -blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - - BLASLONG n, bk, i, blocking, lda; - BLASLONG info; - int mode; - blas_arg_t newarg; - FLOAT *a; - FLOAT alpha[2] = { -ONE, ZERO}; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; -#else - mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif - - if (args -> nthreads == 1) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); -#endif - return info; - } - - n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - - if (range_n) n = range_n[1] - range_n[0]; - - if (n <= GEMM_UNROLL_N * 2) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); -#endif - return info; - } - - newarg.lda = lda; - newarg.ldb = lda; - newarg.ldc = lda; - newarg.alpha = alpha; - newarg.beta = NULL; - newarg.nthreads = args -> nthreads; - - blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; - if (blocking > GEMM_Q) blocking = GEMM_Q; - - for (i = 0; i < n; i += blocking) { - bk = n - i; - if (bk > blocking) bk = blocking; - - newarg.m = bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - - info = CNAME(&newarg, NULL, NULL, sa, sb, 0); - if (info) return info + i; - - if (n - i - bk > 0) { -#ifndef USE_SIMPLE_THREADED_LEVEL3 - newarg.m = n - i - bk; - newarg.k = bk; -#ifndef LOWER - newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; -#else - newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; -#endif - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - - thread_driver(&newarg, sa, sb); -#else - -#ifndef LOWER - newarg.m = bk; - newarg.n = n - i - bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; - - gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); - 
- newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); -#endif -#else - newarg.m = n - i - bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + bk + i * lda) * COMPSIZE; - - gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + (i + bk + i * lda) * COMPSIZE; - newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); -#endif -#endif - -#endif - } - } - return 0; -} diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index e61e8decb..008fcb8cc 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#ifdef HAVE_C11 + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#elif defined(HALF) + mode = BLAS_HALF | BLAS_REAL; + mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; diff --git a/openblas_config_template.h b/openblas_config_template.h index 49aea1cab..9955e5c73 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -34,6 +34,10 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#endif + #ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else diff --git a/param.h b/param.h index 9fdf40fe2..1ab982dc5 100644 --- a/param.h +++ b/param.h @@ -1748,6 +1748,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef COOPERLAKE + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif #ifdef ATOM @@ -1974,7 +2092,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -2225,7 +2343,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL - +#if defined(__32BIT__) +#warning using BINARY32==POWER6 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2234,7 +2362,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 - +#endif #define SGEMM_DEFAULT_P 1280UL #define DGEMM_DEFAULT_P 640UL #define CGEMM_DEFAULT_P 640UL @@ -2254,13 +2382,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 8 #endif -#if defined(POWER9) +#if defined(POWER9) || defined(POWER10) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2288,10 +2416,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #define SYMV_P 8 #endif +#if defined(POWER10) +#undef SHGEMM_DEFAULT_UNROLL_N +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_P +#undef SHGEMM_DEFAULT_R +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_UNROLL_M 16 +#define SHGEMM_DEFAULT_UNROLL_N 8 +#define SHGEMM_DEFAULT_P 832 +#define SHGEMM_DEFAULT_Q 1026 +#define SHGEMM_DEFAULT_R 4096 +#endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 @@ -2618,7 +2764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 -#if defined(CORTEXA53) || defined(CORTEXA57) || \ +#if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) @@ -2664,6 +2810,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 +#elif defined(CORTEXA53) + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + #elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2722,6 +2897,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX3T110) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(NEOVERSEN1) #define SGEMM_DEFAULT_UNROLL_M 16 @@ -2994,7 +3198,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 @@ -3006,12 +3210,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 456 +#define SGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 320 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 224 -#define SGEMM_DEFAULT_Q 488 +#define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 352 diff --git a/test/Makefile b/test/Makefile index 7a873b7e5..45f9821ec 100644 --- a/test/Makefile +++ b/test/Makefile @@ -64,9 +64,17 @@ endif endif endif +ifeq ($(BUILD_HALF),1) +level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +else level3 : sblat3 dblat3 cblat3 zblat3 +endif ifndef CROSS rm -f ?BLAT3.SUMM +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @@ -78,6 +86,10 @@ ifndef CROSS ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_HALF),1) + OMP_NUM_THREADS=2 ./test_shgemm > 
SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -87,6 +99,10 @@ ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 else +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -165,6 +181,11 @@ zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_HALF),1) +test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) @@ -187,7 +208,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - sblat3 dblat3 cblat3 zblat3 \ + test_shgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ diff --git a/test/cblat1.f b/test/cblat1.f index d6b53d105..ecf2a44cb 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -1,7 +1,49 @@ +*> \brief \b CBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 1 BLAS. +*> Based upon the original BLAS test routine together with: +*> +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT1 -* Test program for the COMPLEX Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. 
INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), - + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0), + + (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -129,10 +171,10 @@ + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), - + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), - + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ - DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ - DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + + (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0), + + (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -145,8 +187,8 @@ + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.19E0,-0.17E0), (0.32E0,0.09E0), - + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (0.19E0,-0.17E0), (0.20E0,-0.35E0), + + (0.35E0,0.20E0), (0.14E0,0.08E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), @@ -162,9 +204,9 @@ + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), - + (0.32E0,0.09E0), (6.0E0,9.0E0), - + (0.23E0,-0.24E0), (8.0E0,3.0E0), - + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + + (0.20E0,-0.35E0), (6.0E0,9.0E0), + + (0.35E0,0.20E0), (8.0E0,3.0E0), + + (0.14E0,0.08E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -177,8 +219,8 @@ + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.09E0,0.03E0), (0.03E0,0.12E0), - + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (0.09E0,0.03E0), (0.15E0,0.00E0), + + (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -193,8 +235,8 @@ + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), - + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), - + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + + (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0), + + (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + REAL ZERO + PARAMETER (NOUT=6, ZERO=0.0E0) * .. Scalar Arguments .. REAL SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). 
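The STEST hunk above replaces the old SDIFF-based acceptance test with an explicit machine-epsilon bound: a computed value now passes when ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO). The following is a minimal C sketch of the same criterion (illustrative only, not part of this patch; the helper name is invented); note that Fortran's EPSILON for a default REAL corresponds to C's FLT_EPSILON:

    #include <float.h>
    #include <math.h>

    /* Mirrors the new test in cblat1.f's STEST: accept `computed` when the
       scaled difference is within one machine epsilon of the reference
       magnitude `ssize`. */
    static int stest_accepts(float computed, float expected,
                             float sfac, float ssize)
    {
        float sd = computed - expected;        /* SD = SCOMP(I) - STRUE(I) */
        return fabsf(sfac * sd) <= fabsf(ssize) * FLT_EPSILON;
    }

Unlike the old formulation, which passed only if ABS(SSIZE(I))+ABS(SFAC*SD) rounded back to ABS(SSIZE(I)), this bound does not depend on how an intermediate sum is rounded, so it behaves the same with or without extended-precision arithmetic.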
diff --git a/test/cblat2.f b/test/cblat2.f index 20f188100..8c7bac48e 100644 --- a/test/cblat2.f +++ b/test/cblat2.f @@ -1,68 +1,114 @@ +*> \brief \b CBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'cblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT2 * -* Test program for the COMPLEX Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* CGERC T PUT F FOR NO TEST. SAME COLUMNS. -* CGERU T PUT F FOR NO TEST. SAME COLUMNS. -* CHER T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -71,8 +117,8 @@ PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -126,7 +172,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -135,7 +181,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -240,14 +286,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. @@ -3079,7 +3118,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/cblat3.f b/test/cblat3.f index 5df1ddd64..a65e1364c 100644 --- a/test/cblat3.f +++ b/test/cblat3.f @@ -1,50 +1,96 @@ +*> \brief \b CBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'cblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. 
+*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT3 * -* Test program for the COMPLEX Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* CHERK T PUT F FOR NO TEST. SAME COLUMNS. -* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -53,8 +99,8 @@ PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -103,7 +149,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -112,7 +158,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. 
READ( NIN, FMT = * )REWI @@ -189,14 +235,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. @@ -1301,8 +1340,6 @@ NC = 0 RESET = .TRUE. ERRMAX = RZERO - RALS = RONE - RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) @@ -1948,7 +1985,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1958,12 +1995,19 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to CSYMM and CHEMM +* with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA @@ -1981,6 +2025,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. +* + ALPHA = CMPLX( ONE, -ONE ) + BETA = CMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2207,16 +2259,16 @@ CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2274,16 +2326,16 @@ CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3270,7 +3322,6 @@ 50 CONTINUE END 
IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index 978972b24..57aee7b8f 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include <stdio.h> #include <stdlib.h> -#include "common.h" +#include "../common.h" #define SGEMM BLASFUNC(sgemm) #define SHGEMM BLASFUNC(shgemm) typedef union { @@ -46,50 +46,86 @@ typedef union } bits; } bfloat16_bits; +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + int main (int argc, char *argv[]) { int m, n, k; int i, j, l; + int x; int ret = 0; - int loop = 20; + int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - char transa = 'N'; - char transb = 'N'; - for (int x = 0; x <= loop; x++) + for (x = 0; x <= loop; x++) { m = k = n = x; float A[m * k]; float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float CC[m * n]; + float DD[m * n], CC[m * n]; - for (int j = 0; j < m; j++) { - for (int i = 0; i < m; i++) + for (j = 0; j < m; j++) { + for (i = 0; i < m; i++) { - A[j * k + i] = j * 9.0; - B[j * k + i] = i * 2.0; + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; + DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); - + &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) for (l = 0; l < k; l++) - if (CC[i * m + j] != C[i * m + j]) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) ret++; + if (transA == 'N' && transB == 'N') + { + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (CC[i * m + j] != DD[i * m + j]) + ret++; + } } - fprintf (stderr, "Return code: %d\n", ret); + if (ret != 0) + fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); return ret; } diff --git a/test/dblat2.f b/test/dblat2.f index 4002d4368..9bbbe9792 100644 --- a/test/dblat2.f +++ b/test/dblat2.f @@ -1,75 +1,121 @@ +*> \brief \b DBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ).
An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'dblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGER T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT2 * -* Test program for the DOUBLE PRECISION Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT.
0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* DGER T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/test/dblat3.f b/test/dblat3.f index 082e03e5e..1ebec4ffa 100644 --- a/test/dblat3.f +++ b/test/dblat3.f @@ -1,55 +1,101 @@ +*> \brief \b DBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'dblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT3 * -* Test program for the DOUBLE PRECISION Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. 
-* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + DOUBLE PRECISION ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. 
+* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat2.f b/test/sblat2.f index a1074be52..56ead8640 100644 --- a/test/sblat2.f +++ b/test/sblat2.f @@ -1,75 +1,121 @@ +*> \brief \b SBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'sblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGER T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 
41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT2 * -* Test program for the REAL Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRMV T PUT F FOR NO TEST. SAME COLUMNS. -* STBMV T PUT F FOR NO TEST. SAME COLUMNS. -* STPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRSV T PUT F FOR NO TEST. SAME COLUMNS. -* STBSV T PUT F FOR NO TEST. SAME COLUMNS. -* STPSV T PUT F FOR NO TEST. SAME COLUMNS. -* SGER T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
-* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat3.f b/test/sblat3.f index 325a9eb92..66edac14e 100644 --- a/test/sblat3.f +++ b/test/sblat3.f @@ -1,55 +1,101 @@ +*> \brief \b SBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'sblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. 
+*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT3 * -* Test program for the REAL Level 3 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRSM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. 
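Every tester in this patch replaces the classic machine-precision loop (halve EPS until SDIFF( ONE + EPS, ONE ), or its DDIFF counterpart, reports zero, then double back) with the Fortran 90 EPSILON intrinsic, as the EPS hunks above and below show. A minimal C sketch, illustrative only and not part of the patch, of why the two agree; FLT_EPSILON plays the role of EPSILON(ZERO), and the volatile store stands in for the SDIFF round trip that kept the old loop honest:

```c
#include <stdio.h>
#include <float.h>

/* Legacy tester approach: halve EPS until 1 + EPS rounds to 1, then
 * double back.  The volatile store forces each sum through a 32-bit
 * float, the same job SDIFF did in the Fortran loop; without it an
 * extended-precision register can carry the loop past the true
 * epsilon, which is the fragility the EPSILON intrinsic avoids. */
static float eps_by_halving(void)
{
    float eps = 1.0f;
    volatile float sum = 1.0f + eps;
    while (sum != 1.0f) {
        eps *= 0.5f;
        sum = 1.0f + eps;
    }
    return eps + eps;           /* double back, as the testers did */
}

int main(void)
{
    printf("loop eps    = %g\n", (double)eps_by_halving());
    printf("FLT_EPSILON = %g\n", (double)FLT_EPSILON); /* EPSILON(0.0) */
    return 0;
}
```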
@@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat1.f b/test/zblat1.f index 8b4b8d21e..2d7b88490 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -1,7 +1,49 @@ +*> \brief \b ZBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 1 BLAS. +*> +*> Based upon the original BLAS test routine together with: +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT1 -* Test program for the COMPLEX*16 Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), - + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0), + + (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -129,10 +171,10 @@ + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), - + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), - + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ - DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ - DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + + (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0), + + (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -145,8 +187,8 @@ + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.19D0,-0.17D0), (0.32D0,0.09D0), - + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (0.19D0,-0.17D0), (0.20D0,-0.35D0), + + (0.35D0,0.20D0), (0.14D0,0.08D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), @@ -162,9 +204,9 @@ + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), - + (0.32D0,0.09D0), (6.0D0,9.0D0), - + (0.23D0,-0.24D0), (8.0D0,3.0D0), - + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + + (0.20D0,-0.35D0), (6.0D0,9.0D0), + + (0.35D0,0.20D0), (8.0D0,3.0D0), + + (0.14D0,0.08D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -177,8 +219,8 @@ + (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.09D0,0.03D0), (0.03D0,0.12D0), - + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (0.09D0,0.03D0), (0.15D0,0.00D0), + + (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -193,8 +235,8 @@ + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), - + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), - + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + + (0.15D0,0.00D0), 
(6.0D0,9.0D0), (0.00D0,0.15D0), + + (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + DOUBLE PRECISION ZERO + PARAMETER (NOUT=6, ZERO=0.0D0) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/test/zblat2.f b/test/zblat2.f index e65cdcc70..4a20ac567 100644 --- a/test/zblat2.f +++ b/test/zblat2.f @@ -1,68 +1,114 @@ +*> \brief \b ZBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'zblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. 
+*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT2 * -* Test program for the COMPLEX*16 Level 2 Blas. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -72,8 +118,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -127,7 +173,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -136,7 +182,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -241,14 +287,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. @@ -3087,7 +3126,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat3.f b/test/zblat3.f index f03b1a617..0e38334e9 100644 --- a/test/zblat3.f +++ b/test/zblat3.f @@ -1,50 +1,97 @@ +*> \brief \b ZBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'zblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. 
+*>    Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*>    Sven Hammarling, Numerical Algorithms Group Ltd.
+*>
+*> 10-9-00:  Change STATUS='NEW' to 'UNKNOWN' so that the testers
+*>           can be run multiple times without deleting generated
+*>           output files (susan)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date April 2012
+*
+*> \ingroup complex16_blas_testing
+*
+*  =====================================================================
       PROGRAM ZBLAT3
 *
-*  Test program for the COMPLEX*16 Level 3 Blas.
+*  -- Reference BLAS test routine (version 3.7.0) --
+*  -- Reference BLAS is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     April 2012
 *
-*  The program must be driven by a short data file. The first 14 records
-*  of the file are read using list-directed input, the last 9 records
-*  are read using the format ( A6, L2 ). An annotated example of a data
-*  file can be obtained by deleting the first 3 characters from the
-*  following 23 lines:
-*  'ZBLAT3.SUMM'            NAME OF SUMMARY OUTPUT FILE
-*  6                        UNIT NUMBER OF SUMMARY FILE
-*  'ZBLAT3.SNAP'            NAME OF SNAPSHOT OUTPUT FILE
-*  -1                       UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
-*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
-*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
-*  16.0     THRESHOLD VALUE OF TEST RATIO
-*  6                 NUMBER OF VALUES OF N
-*  0 1 2 3 5 9       VALUES OF N
-*  3                 NUMBER OF VALUES OF ALPHA
-*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA
-*  3                 NUMBER OF VALUES OF BETA
-*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA
-*  ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHERK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
-*  ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
-*
-*  See:
-*
-*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
-*     A Set of Level 3 Basic Linear Algebra Subprograms.
-*
-*     Technical Memorandum No.88 (Revision 1), Mathematics and
-*     Computer Science Division, Argonne National Laboratory, 9700
-*     South Cass Avenue, Argonne, Illinois 60439, US.
-*
-*  -- Written on 8-February-1989.
-*     Jack Dongarra, Argonne National Laboratory.
-*     Iain Duff, AERE Harwell.
-*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
-*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*  =====================================================================
 *
 *     .. Parameters ..
       INTEGER            NIN
@@ -54,8 +101,8 @@
       COMPLEX*16         ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
      $                   ONE = ( 1.0D0, 0.0D0 ) )
-      DOUBLE PRECISION   RZERO, RHALF, RONE
-      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
       INTEGER            NMAX
       PARAMETER          ( NMAX = 65 )
       INTEGER            NIDMAX, NALMAX, NBEMAX
@@ -104,7 +151,7 @@
 *
       READ( NIN, FMT = * )SUMMRY
       READ( NIN, FMT = * )NOUT
-      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' )
       NOUTC = NOUT
 *
 *     Read name and unit number for snapshot output file and open file.
@@ -113,7 +160,7 @@
       READ( NIN, FMT = * )NTRA
       TRACE = NTRA.GE.0
       IF( TRACE )THEN
-         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' )
       END IF
 *     Read the flag that directs rewinding of the snapshot file.
       READ( NIN, FMT = * )REWI
@@ -190,14 +237,7 @@
 *
 *     Compute EPS (the machine precision).
 *
-      EPS = RONE
-   70 CONTINUE
-      IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO )
-     $   GO TO 80
-      EPS = RHALF*EPS
-      GO TO 70
-   80 CONTINUE
-      EPS = EPS + EPS
+      EPS = EPSILON(RZERO)
       WRITE( NOUT, FMT = 9998 )EPS
 *
 *     Check the reliability of ZMMCH using exact data.
@@ -1303,8 +1343,6 @@
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
-      RALS = RONE
-      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )
@@ -1951,7 +1989,7 @@
 *
 *  Tests the error exits from the Level 3 Blas.
 *  Requires a special version of the error-handling routine XERBLA.
-*  ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined.
+*  A, B and C should not need to be defined.
 *
 *  Auxiliary routine for test program for Level 3 Blas.
 *
@@ -1961,12 +1999,20 @@
 *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
 *     Sven Hammarling, Numerical Algorithms Group Ltd.
 *
+*  3-19-92:  Initialize ALPHA, BETA, RALPHA, and RBETA  (eca)
+*  3-19-92:  Fix argument 12 in calls to ZSYMM and ZHEMM
+*            with INFOT = 9  (eca)
+*  10-9-00:  Declared INTRINSIC DCMPLX (susan)
+*
 *     .. Scalar Arguments ..
       INTEGER            ISNUM, NOUT
       CHARACTER*6        SRNAMT
 *     .. Scalars in Common ..
       INTEGER            INFOT, NOUTC
       LOGICAL            LERR, OK
+*     .. Parameters ..
+      REAL               ONE, TWO
+      PARAMETER          ( ONE = 1.0D0, TWO = 2.0D0 )
 *     .. Local Scalars ..
       COMPLEX*16         ALPHA, BETA
       DOUBLE PRECISION   RALPHA, RBETA
@@ -1975,6 +2021,8 @@
 *     .. External Subroutines ..
       EXTERNAL           ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM,
      $                   ZSYR2K, ZSYRK, ZTRMM, ZTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX
 *     .. Common blocks ..
       COMMON             /INFOC/INFOT, NOUTC, OK, LERR
 *     .. Executable Statements ..
@@ -1984,6 +2032,14 @@
 *     LERR is set to .TRUE. by the special version of XERBLA each time
 *     it is called, and is then tested and re-set by CHKXER.
       LERR = .FALSE.
+*
+*     Initialize ALPHA, BETA, RALPHA, and RBETA.
+*
+      ALPHA = DCMPLX( ONE, -ONE )
+      BETA = DCMPLX( TWO, -TWO )
+      RALPHA = ONE
+      RBETA = TWO
+*
       GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
      $        90 )ISNUM
    10 INFOT = 1
@@ -2210,16 +2266,16 @@
       CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -2277,16 +2333,16 @@
       CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 9
-      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+      CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
       CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
       INFOT = 12
       CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
@@ -3276,7 +3332,6 @@
    50    CONTINUE
       END IF
 *
-   60 CONTINUE
       LZERES = .TRUE.
       GO TO 80
    70 CONTINUE
diff --git a/utest/Makefile b/utest/Makefile
index 0b9892411..31d4ccf00 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -34,6 +34,9 @@ endif
 ifeq ($(C_COMPILER), PGI)
 OBJS = utest_main2.o
 endif
+ifeq ($(OSNAME), AIX)
+OBJS = utest_main2.o
+endif
 
 all : run_test
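Note on the EPS hunks in zblat2.f and zblat3.f above: both testers drop the run-time bisection loop (and with it the RHALF and RONE parameters, the dead 60/90/100 labels, and the dependence on the DDIFF helper) in favour of the Fortran 90 intrinsic EPSILON, which returns the same machine precision directly. A minimal sketch of the equivalence follows, assuming a Fortran 90 compiler and IEEE double precision; the EPSDEMO program and its local copy of DDIFF are illustrative stand-ins, not part of the patch.

*     Sketch (not part of the patch): the old loop halves OLD until
*     RONE + OLD rounds back to RONE, then doubles once to recover the
*     last distinguishable spacing; EPSILON yields the same value.
      PROGRAM EPSDEMO
      DOUBLE PRECISION   EPS, OLD, RZERO, RHALF, RONE
      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
      DOUBLE PRECISION   DDIFF
*     Old approach, as in the removed lines of the testers.
      OLD = RONE
   10 CONTINUE
      IF( DDIFF( RONE + OLD, RONE ).NE.RZERO )THEN
         OLD = RHALF*OLD
         GO TO 10
      END IF
      OLD = OLD + OLD
*     New approach: ask the compiler for the model precision.
      EPS = EPSILON(RZERO)
      WRITE( *, * ) 'loop:', OLD, '  intrinsic:', EPS
      END
*     Forcing the subtraction through a function, as the testers did,
*     keeps an extended-precision register from hiding the rounding.
      DOUBLE PRECISION FUNCTION DDIFF( X, Y )
      DOUBLE PRECISION   X, Y
      DDIFF = X - Y
      RETURN
      END

On conforming IEEE hardware both paths print 2.2204460492503131D-16, which is why the intrinsic can replace the loop without changing any test thresholds.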