diff --git a/.travis.yml b/.travis.yml index 8657b64f4..85a57f6e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,33 +1,38 @@ # XXX: Precise is already deprecated, new default is Trusty. # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise +dist: focal sudo: true language: c matrix: include: - &test-ubuntu - os: linux +# os: linux compiler: gcc addons: apt: packages: - gfortran +# before_script: &common-before +# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" +# script: +# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# - make -C test $COMMON_FLAGS $BTYPE +# - make -C ctest $COMMON_FLAGS $BTYPE +# - make -C utest $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64" +# +# - <<: *test-ubuntu + os: linux-ppc64le before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" env: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX diff --git a/CMakeLists.txt b/CMakeLists.txt index 0330b2ce7..a18a7adc3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 17.dev) +set(OpenBLAS_PATCH_VERSION 18.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -132,7 +132,7 @@ endif () if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing + # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") diff --git a/Changelog.txt b/Changelog.txt index ee0484e2b..59fe1d45e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,47 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.18 + 02-Oct-2021 + +general: + - when the build-time number of preconfigured threads is exceeded + at runtime (typically by an external program calling BLAS functions + from a larger number of threads in parallel), OpenBLAS will now + allocate an auxiliary control structure for up to 512 additional + threads instead of aborting + - added support for Loongson's LoongArch64 cpu architecture + - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON + - added support for building OpenBLAS as a CMAKE subproject + - added support for building for Windows/ARM64 targets with clang + - improved support for building with the IBM xlf compiler + - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) + - imported Reference-LAPACK PR 597 for testsuite compatibility with + LLVM's libomp + +x86_64: + - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) + - added optimized SBGEMM for Intel Cooper Lake + - reinstated the performance patch for AVX512 SGEMV_T with a proper fix + - added a workaround for a gcc11 tree-vectorizer bug that caused spurious + failures in the test programs for complex BLAS3 when compiling at -O3 + (the default for cmake "release" builds) + - added support for 
runtime cpu count detection under Haiku OS + - worked around a long-standing miscompilation issue of the Haswell DGEMV_T + kernel with gcc that could produce NaN output in some corner cases + +POWER: + - improved performance of DASUM on POWER10 + +ARMV8: + - fixed crashes (use of reserved register x18) on Apple M1 under OSX + - fixed building with gcc releases earlier than 5.1 + +MIPS: + - fixed building under BSD + +MIPS64: + - fixed building under BSD + ==================================================================== Version 0.3.17 15-Jul-2021 diff --git a/Makefile b/Makefile index 555d1c467..49fd57ff2 100644 --- a/Makefile +++ b/Makefile @@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.power b/Makefile.power index 946f55232..28a0bae08 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,9 +12,13 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) @@ -33,7 +37,11 @@ else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -fno-fast-math +endif ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif diff --git a/Makefile.rule b/Makefile.rule index 7c04a3101..500b7c44f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.17.dev +VERSION = 0.3.18.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index 20db80d07..bd2164d02 100644 --- a/Makefile.system +++ b/Makefile.system @@ -16,6 +16,8 @@ else HOSTARCH = $(ARCH) endif +HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null) + # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 @@ -33,6 +35,10 @@ else ifeq ($(ARCH), armv7) override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 +else ifeq ($(ARCH), mipsel) +override ARCH=mips +else ifeq ($(ARCH), mips64el) +override ARCH=mips64 else ifeq ($(ARCH), zarch) override ARCH=zarch endif @@ -303,7 +309,7 @@ else SMP = 1 endif else -ifeq ($(NUM_THREAD), 1) +ifeq ($(NUM_THREADS), 1) SMP = else SMP = 1 diff --git a/README.md b/README.md index 88a5a5035..6ce85e08e 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS - **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex-A55**: same as ARMV8 (different cpu specifications) - **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A72**: same as A57 ( different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications) @@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th #### RISC-V -- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. +- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. 
```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` + (also known to work on C906) ### Support for multiple targets in a single library diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a0ca2e03d..617692191 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,7 +19,7 @@ jobs: # of gcc / glibc - job: manylinux1_gcc pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | echo "FROM quay.io/pypa/manylinux1_x86_64 @@ -35,7 +35,7 @@ jobs: displayName: Run manylinux1 docker build - job: Intel_SDE_skx pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | # at the time of writing the available Azure Ubuntu vm image @@ -213,8 +213,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ + && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 631664569..f7aa4c5c9 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -104,7 +104,7 @@ endif () if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 + set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0c102bae5..09ca5eb57 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -134,6 +134,8 @@ if (BUILD_BFLOAT16) set(SHSWAPKERNEL ../arm/swap.c) set(TOBF16KERNEL ../x86_64/tobf16.c) set(BF16TOKERNEL ../x86_64/bf16to.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () diff --git a/cmake/system.cmake b/cmake/system.cmake index 7d2672998..f56ded966 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -469,6 +469,9 @@ endif() if (BUILD_COMPLEX16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() +if (BUILD_BFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 2a9399f7d..73a82d188 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -26,10 +26,12 @@ *****************************************************************************/ #include -#ifdef OS_DARWIN +#ifdef __APPLE__ #include int32_t value; size_t length=sizeof(value); +int64_t value64; +size_t length64=sizeof(value64); #endif #define CPU_UNKNOWN 0 @@ -212,9 +214,9 @@ int detect(void) } #else -#ifdef DARWIN +#ifdef __APPLE__ sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_VORTEX; + if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; #endif return CPU_ARMV8; #endif @@ -265,7 +267,7 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif -#ifdef DARWIN +#ifdef __APPLE__ sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); #endif @@ -420,17 +422,19 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); 
printf("#define DTB_SIZE 4096 \n"); break; -#ifdef DARWIN +#ifdef __APPLE__ case CPU_VORTEX: printf("#define VORTEX \n"); - sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); - printf("#define L1_CODE_SIZE %d \n",value); - sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); - printf("#define L1_CODE_LINESIZE %d \n",value); - sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); - printf("#define L1_DATA_SIZE %d \n",value); - sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); - printf("#define L2_SIZE %d \n",value); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); break; #endif } diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 61367e596..3e9964ab1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) endif () +# special defines for complex if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") foreach (u_source ${U_SOURCES}) @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () +if (BUILD_BFLOAT16) + if (USE_THREAD) + GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") + endif () +endif () + if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 077862abc..75b25d039 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + endif () + endif () endforeach () if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d9c15b312..2847ea9ae 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ -// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -//#define HAVE_P10_SUPPORT 1 -//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif diff --git a/driver/others/memory.c b/driver/others/memory.c index 48067923e..0185fa683 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2695,7 +2695,7 @@ static volatile struct { } memory[NUM_BUFFERS]; -static volatile 
struct newmemstruct +struct newmemstruct { BLASULONG lock; void *addr; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 36da13369..791e5dc27 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,6 +524,9 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif +#ifdef BUILD_BFLOAT16 + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -629,7 +632,9 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/getarch.c b/getarch.c index 6e43616f7..094feaadd 100644 --- a/getarch.c +++ b/getarch.c @@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif +#endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ @@ -350,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_SKYLAKEX -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SKYLAKEX" #define ARCHCONFIG "-DSKYLAKEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -380,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_COOPERLAKE -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "COOPERLAKE" #define ARCHCONFIG "-DCOOPERLAKE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5346ecadd..ccb5fce3f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") @@ -104,6 +105,15 @@ endif () GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) +if (BUILD_BFLOAT16) + GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) diff --git a/interface/gemm.c b/interface/gemm.c index 6dcc54041..71cc77a1b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif diff --git a/interface/zsyr.c b/interface/zsyr.c index 71d4dbf29..54fb8a4e9 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; FLOAT * ALPHA = α FLOAT alpha_r = ALPHA[0]; @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d8a230436..9ffbd944f 
100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # sbdot + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") + endif() + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) @@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE BFLOAT16) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - if (NOT ${BUILD_BFLOAT16}) - continue () - else () - set (float_char "SB") - endif () - endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) @@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + if (SBGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMONCOPY) + 
GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () + if (BUILD_BFLOAT16) + if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) + set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NN) + set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NT) + set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TN) + set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TT) + set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) + set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) + set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) + set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) + set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) @@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" 
"geadd_k" false "" "" false ${float_type}) endforeach () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") @@ -840,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () - if (SGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () - - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") endif () diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 9ab51ff57..7e5bf6080 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B03 x16 #define B04 x17 -#define I x18 -#define J x19 +#define I x19 +#define J x20 -#define TEMP1 x20 -#define TEMP2 x21 +#define TEMP1 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 0ac5a5f24..3d953266c 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha x17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 d10 #define alphaV0 v10.d[0] diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 46198b3a2..431f1ae2a 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -30,7 +30,7 @@ All rights reserved. 
#define B00 x22 -#define I x18 +#define I x21 #define J x19 #define TEMP1 x20 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 985a0a9a6..a44326aeb 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha w17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 s10 #define alphaV0 v10.s[0] diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index f8e877f3c..a65c4f581 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 -#define alphaR x17 -#define alphaI x18 +#define alphaR x19 +#define alphaI x20 #define alpha0_R d10 #define alphaV0_R v10.d[0] diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 462acfe2b..cd053b896 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR x17 -#define alphaI x18 +#define alphaI x22 #define temp x19 #define tempOffset x20 #define tempK x21 diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 5abbb735c..84568ee0b 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; #if V_SIMD && !defined(DSDOT) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } dot = v_sum_f32(vsum0); #elif defined(DSDOT) + int n1 = n & -4; for (; i < n1; i += 4) { dot += (double) y[i] * (double) x[i] @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + (double) y[i+3] * (double) x[i+3] ; } #else + int n1 = n & -4; for (; i < n1; i += 4) { dot += y[i] * x[i] diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 873653f1e..63816cb5f 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,7 +1,6 @@ -ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifeq ($(HAVE_GAS), 1) include $(KERNELDIR)/KERNEL.POWER8 else - #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -44,6 +43,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c @@ -218,5 +218,4 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c - endif diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 56a5ab47a..902eba82c 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, #endif const float *mvecp = mvec; /* We have to load reverse mask for big endian. 
*/ - /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + long ytmp; __asm__ @@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "stxv 34, 80(%4) \n\t" "stxv 39, 96(%4) \n\t" "stxv 38, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "stxv 34, 80(%4) \n\t" "stxv 39, 96(%4) \n\t" "stxv 38, 112(%4) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c index 6c80f9cd4..f30e1fa09 100644 --- a/kernel/power/ccopy_microk_power10.c +++ b/kernel/power/ccopy_microk_power10.c @@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) ".align 5 \n" "one%=: \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 36, 80(%3) \n\t" "stxv 39, 96(%3) \n\t" "stxv 38, 112(%3) \n\t" +#endif "lxvp 32, 0(%2) \n\t" "lxvp 34, 32(%2) \n\t" "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else "stxv 41, 128(%3) \n\t" "stxv 40, 144(%3) \n\t" "stxv 43, 160(%3) \n\t" @@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 44, 208(%3) \n\t" "stxv 47, 224(%3) \n\t" "stxv 46, 240(%3) \n\t" +#endif "lxvp 40, 128(%2) \n\t" "lxvp 42, 160(%2) \n\t" "lxvp 44, 192(%2) \n\t" @@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "bgt one%= \n" "two%=: \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 
176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 44, 208(%3) \n\t" "stxv 47, 224(%3) \n\t" "stxv 46, 240(%3) \n\t" - +#endif "#n=%1 x=%4=%2 y=%0=%3" : "=m" (*y), diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index b9e2d2ce5..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #include "common.h" -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) #include "cdot_microk_power10.c" #else #ifndef HAVE_KERNEL_8 @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) BLASLONG n1 = n & -16; #else BLASLONG n1 = n & -8; diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c index 399f2b180..9d42559c9 100644 --- a/kernel/power/cdot_microk_power10.c +++ b/kernel/power/cdot_microk_power10.c @@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cdot_kernel_8 (long n, float *x, float *y, float *dot) { +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif __asm__ ( "dcbt 0, %2 \n\t" @@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot) "xxswapd 33, 34 \n\t" "xvaddsp 35, 35, 32 \n\t" "xvaddsp 34, 34, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xxpermdi 34, 35, 34, 0 \n\t" +#else "xxpermdi 34, 34, 35, 2 \n\t" +#endif "stxv 34, 0(%6) \n\t" "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S index e04f948dd..fbd22aaad 100644 --- a/kernel/power/cgemm_kernel_power10.S +++ b/kernel/power/cgemm_kernel_power10.S @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_power10.S" +#if (_AIX) +.set perm_const1, 0x0405060700010203 +.set perm_const2, 0x0c0d0e0f08090a0b +.set save_permute_12, 0x1011121300010203 +.set save_permute_11, 0x18191a1b08090a0b +#else .equ perm_const1, 0x0405060700010203 .equ perm_const2, 0x0c0d0e0f08090a0b .equ save_permute_12, 0x0c0d0e0f1c1d1e1f .equ save_permute_11, 0x0405060714151617 - +#endif #ifndef NEEDPARAM @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ - +#if (_AIX) + lis T2, (perm_const2>>48 & 0xFFFF) + lis T1, (perm_const1>>48 & 0xFFFF) + lis T3, (save_permute_12>>48 & 0xFFFF) + lis T4, (save_permute_11>>48 & 0xFFFF) + + ori T2, T2, (perm_const2>>32 & 0xFFFF) + ori T1, T1, (perm_const1>>32 & 0xFFFF) + ori T3, T3, (save_permute_12>>32 & 0xFFFF) + ori T4, T4, (save_permute_11>>32 & 0xFFFF) +#else lis T2, perm_const2@highest lis T1, perm_const1@highest lis T3, save_permute_12@highest lis T4, save_permute_11@highest - ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher ori T3, T3, save_permute_12@higher ori T4, T4, save_permute_11@higher - +#endif rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 rldicr T3, T3, 32, 31 rldicr T4, T4, 32, 31 +#if (_AIX) + oris T2, T2, (perm_const2>>16 & 0xFFFF) + oris T1, T1, (perm_const1>>16 & 0xFFFF) + oris T3, T3, (save_permute_12>>16 & 0xFFFF) + oris T4, T4, (save_permute_11>>16 & 0xFFFF) + + ori T2, T2, (perm_const2 & 0xFFFF) + ori T1, T1, (perm_const1 & 0xFFFF) + ori T3, T3, (save_permute_12 & 0xFFFF) + ori T4, T4, (save_permute_11 & 0xFFFF) +#else oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h oris T3, T3, save_permute_12@h @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ori T1, T1, perm_const1@l ori T3, T3, save_permute_12@l ori T4, T4, save_permute_11@l - +#endif li r0,0 li PRE,512 diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S index b66e93405..f75bf5dad 100644 --- a/kernel/power/cgemm_macros_power10.S +++ b/kernel/power/cgemm_macros_power10.S @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .endm .macro LOAD4x8_2 @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 42, 38 + xvf32gerpp 2, 43, 38 + xvf32gerpp 1, 40, 38 + xvf32gerpp 0, 41, 38 + xvf32gerpp 7, 42, 39 + xvf32gerpp 6, 43, 39 + xvf32gerpp 5, 40, 39 + xvf32gerpp 4, 41, 39 +#else xvf32gerpp 3, 42, 39 xvf32gerpp 2, 43, 39 xvf32gerpp 1, 40, 39 @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf32gerpp 6, 43, 38 xvf32gerpp 5, 40, 38 xvf32gerpp 4, 41, 38 +#endif .if \Complete==0 lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) MULT_APLHA_PART1 vs48, vs56, vs0, vs1 @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs32, vs32, vs3 xvaddsp vs33, vs33, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs40, vs40, vs7 xvaddsp vs41, vs41, vs5 xvaddsp vs34, vs34, vs11 xvaddsp vs35, vs35, vs9 xvaddsp vs42, vs42, vs15 xvaddsp vs43, vs43, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs33, vs0, vs8, 1 + xxpermdi vs32, vs2, vs10, 1 + xxpermdi vs41, vs4, vs12, 1 + xxpermdi vs40, vs6, vs14, 1 + xxpermdi vs35, vs8, vs0, 1 + xxpermdi vs34, vs10, vs2, 1 + xxpermdi vs43, vs12, vs4, 1 + xxpermdi vs42, vs14, vs6, 1 #else xxpermdi vs33, vs8, vs0, 2 xxpermdi vs32, vs10, vs2, 2 @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs34, vs2, vs10, 2 xxpermdi vs43, vs4, vs12, 2 xxpermdi vs42, vs6, vs14, 2 +#endif #endif stxvp vs32, 0(T2) stxvp vs40, 32(T2) @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .endm .macro LOAD4x4_2 @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 39 + xvf32gerpp 2, 37, 39 + xvf32gerpp 1, 36, 38 + xvf32gerpp 0, 37, 38 +#else xvf32gerpp 3, 36, 38 xvf32gerpp 2, 37, 38 xvf32gerpp 1, 36, 39 xvf32gerpp 0, 37, 39 +#endif .if \Complete==0 lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs7, vs14, vs6, 2 xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs29, vs29, vs5 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 + xxpermdi vs29, vs4, vs12, 1 + xxpermdi vs28, vs6, vs14, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs14, vs6, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 32 xvf32gerpp 0, 35, 32 +#endif .endm .macro LOAD4x2_2 @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 33 xvf32gerpp 0, 35, 33 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 37, 33 + xvf32gerpp 0, 36, 33 +#else xvf32gerpp 1, 36, 32 xvf32gerpp 0, 37, 32 +#endif .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs2, vs10, 0 + xxpermdi vs3, vs8, vs0, 3 + xxpermdi vs11, vs10, vs2, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs10, vs2, 0 xxpermdi vs3, vs0, vs8, 3 xxpermdi vs11, vs2, vs10, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 xvaddsp vs25, vs25, vs3 xvaddsp vs27, vs27, vs11 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs2, vs10, 0 + xxpermdi vs25, vs8, vs0, 3 + xxpermdi vs27, vs10, vs2, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs10, vs2, 0 xxpermdi vs25, vs0, vs8, 3 xxpermdi vs27, vs2, vs10, 3 +#endif #endif stxv vs24, 0(CO) stxv vs25, 0(T1) @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .endm .macro LOAD4x1_2 @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x1_2O OffsetA, OffsetB lxv vs32, (\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO) .endm @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 36, 33 + xvf32gerpp 1, 37, 33 +#else xvf32gerpp 0, 37, 33 xvf32gerpp 1, 36, 33 +#endif .if \Complete==0 lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif .endif .if \IsLast==1 .if \Complete==1 @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 2, 37, 35 xvf32gerpp 3, 36, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 41, 35 + xvf32gerpp 3, 40, 35 + xvf32gerpp 0, 39, 35 + xvf32gerpp 1, 38, 35 +#else xvf32gerpp 2, 41, 34 xvf32gerpp 3, 40, 34 xvf32gerpp 0, 39, 34 xvf32gerpp 1, 38, 34 +#endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 +#else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 32(CO) @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 37, 35 + xvf32gerpp 1, 36, 35 +#else xvf32gerpp 0, 37, 34 xvf32gerpp 1, 36, 34 +#endif + .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
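As an illustration (not part of the patch): the xxpermdi/xvaddsp store sequences guarded above ultimately compute C += alpha*(A*B) for a small tile of complex single-precision results; only the lane interleaving differs between byte orders. A scalar reference of that accumulation step; caccum_ref and the packed ab tile layout are assumptions made for the sketch.

#include <complex.h>

/* scalar reference for the accumulate-into-C step done by the permute/add
   sequences above (column-major C with leading dimension ldc) */
static void caccum_ref(int m, int n, float complex alpha,
                       const float complex *ab,   /* m*n partial products of A*B */
                       float complex *c, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
            c[i + j * ldc] += alpha * ab[i + j * m];
}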
RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs8, vs9, save_permute_1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs8, vs0, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs0, vs8, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs8, vs0, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs0, vs8, 3 +#endif #endif stxv vs24, 0(CO) stxv vs26, 0(T1) @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) vspltisb v10, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 3, 35, 40 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 xxperm vs4, vs5, vs28 xxperm vs6, vs7, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs26, 32(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) + stxv vs6, 32(CO) + stxv vs4, 48(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) +#endif #endif addi CO, CO, 64 .endm @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
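As an illustration (not part of the patch): the paired #ifndef TRMMKERNEL / #else branches above choose between accumulating into C and overwriting it, since the TRMM variants never read C. A scalar sketch of the two store policies; store_tile is an illustrative name.

/* accumulate == 1: GEMM-style C += result; accumulate == 0: TRMM-style overwrite */
static void store_tile(int len, const float *result, float *c, int accumulate)
{
    for (int i = 0; i < len; i++)
        c[i] = accumulate ? c[i] + result[i] : result[i];
}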
lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, (32+\OffsetA)(AO) .endm @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 1, 35, 36 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs24, 0(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) +#endif #endif addi CO, CO, 32 .endm @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 +#else xxperm vs0, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs0 @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs37, vs1 MULT_APLHA_PART2 vs32, vs40, vs37, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs37, vs1, save_permute_1 +#else xxperm vs37, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs36, vs36, vs37 diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c index 70b50809e..d6a91f079 100644 --- a/kernel/power/cscal_microk_power10.c +++ b/kernel/power/cscal_microk_power10.c @@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) { __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif __asm__ ( "dcbt 0, %2 \n\t" diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index c2fde1c44..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
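As an illustration (not part of the patch): the byte-permute mask change in cscal_microk_power10.c above only re-expresses the real/imaginary lane swap in big-endian byte numbering; the arithmetic is unchanged, namely x <- alpha*x with re' = ar*re - ai*im and im' = ar*im + ai*re. A scalar reference for the complex scaling these micro-kernels vectorize; zscal_ref is an illustrative name.

/* scalar complex scaling: x[i] <- (ar + ai*I) * x[i], interleaved re/im storage */
static void zscal_ref(long n, double *x, double ar, double ai)
{
    for (long i = 0; i < n; i++) {
        double re = x[2 * i], im = x[2 * i + 1];
        x[2 * i]     = ar * re - ai * im;
        x[2 * i + 1] = ar * im + ai * re;
    }
}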
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "cswap_microk_power10.c" #elif defined(POWER10) -#include "cswap_microk_power8.c" +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 35390dd24..9ed0af767 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "dasum_microk_power10.c" #elif defined(POWER10) -#include "dasum_microk_power8.c" +#include "dasum_microk_power10.c" #endif #endif - #ifndef HAVE_KERNEL_16 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) @@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index e47de2cb5..65743731e 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha +#else "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha +#endif "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha - +#endif "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda "add %10, %10, %10 \n\t" // 2 * lda +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha @@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha +#endif "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda "add %6, 
%4, %10 \n\t" // a3 = a1 + 2 * lda @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "one%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" +#endif "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" +#else "xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 37, 43, 35 \n\t" +#endif "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" +#else "xvmaddadp 36, 44, 32 \n\t" "xvmaddadp 37, 45, 32 \n\t" +#endif "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" +#else "xvmaddadp 36, 46, 33 \n\t" "xvmaddadp 37, 47, 33 \n\t" +#endif "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" +#else "xvmaddadp 36, 50, 48 \n\t" "xvmaddadp 37, 51, 48 \n\t" +#endif "lxvpx 50, %7, %11 \n\t" // a4[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" +#else "xvmaddadp 36, 52, 49 \n\t" "xvmaddadp 37, 53, 49 \n\t" +#endif "lxvpx 52, %8, %11 \n\t" // a5[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" +#else "xvmaddadp 36, 54, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t" +#endif "lxvpx 54, %9, %11 \n\t" // a6[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "lxvpx 56, %10, %11 \n\t" // a7[0] "addi %11, %11, 32 \n\t" @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "two%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 36, 42, 35 \n\t" @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "stxvp 36, 0( %2) \n\t" // y0, y1 : diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c index 3db4d5785..899b2a04b 100644 --- a/kernel/power/dgemv_t_power10.c +++ b/kernel/power/dgemv_t_power10.c @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvp 40, 32(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(42,34,35) + XXMRGLD_S(43,34,35) + XXMRGHD_S(44,4,5) + XXMRGLD_S(45,4,5) +#else XXMRGLD_S(42,35,34) XXMRGHD_S(43,35,34) XXMRGLD_S(44,5,4) XXMRGHD_S(45,5,4) +#endif "xvadddp 42,42,43 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(46,6,7) + 
XXMRGLD_S(47,6,7) +#else XXMRGLD_S(46,7,6) XXMRGHD_S(47,7,6) - +#endif "xvadddp 44,44,45 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(48,8,9) + XXMRGLD_S(49,8,9) +#else XXMRGLD_S(48,9,8) XXMRGHD_S(49,9,8) - +#endif "xvadddp 46,46,47 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 38,42,36 \n\t" + "xvmaddadp 39,44,36 \n\t" +#else "xvmaddadp 39,42,36 \n\t" "xvmaddadp 38,44,36 \n\t" - +#endif "xvadddp 48,48,49 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 41,48,36 \n\t" +#else "xvmaddadp 41,46,36 \n\t" - +#endif "stxvp 38, 0(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 40,46,36 \n\t" +#else "xvmaddadp 40,48,36 \n\t" +#endif "stxvp 40, 32(%[y]) \n\t" : [memy] "+m" (*(double (*)[8])y), diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 3229878e4..2aa0b8055 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "drot_microk_power10.c" #elif defined(POWER10) -#include "drot_microk_power8.c" +#include "drot_microk_power10.c" #endif #endif @@ -110,8 +108,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT { BLASLONG i=0; BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; FLOAT temp; if ( n <= 0 ) return(0); @@ -119,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -139,7 +135,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) { - drot_kernel_16(n1, x1, y1, c, s); + drot_kernel_16(n1, x, y, c, s); i=n1; } #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 32c39a8f4..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "dscal_microk_power10.c" #elif defined(POWER10) -#include "dscal_microk_power8.c" +#include "dscal_microk_power10.c" #endif #endif @@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 12476965b..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
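As an illustration (not part of the patch): the POWER10 branches now enabled for both byte orders in drot.c and dscal.c (and the single-precision files further down) first peel a few leading elements so the vector loop starts on a 32-byte boundary; that is what ((32 - (addr & 0x1F)) >> 3) & 0x3 computes for doubles and ((32 - (addr & 0x1F)) >> 2) & 0x7 for floats. A generic sketch of the same calculation; peel_count is illustrative and assumes a power-of-two element size and an element-aligned pointer.

#include <stddef.h>
#include <stdint.h>

/* number of leading elements to process with scalar code before the
   pointer reaches the next 32-byte boundary */
static long peel_count(const void *p, size_t elem_size)
{
    uintptr_t misalign = (uintptr_t)p & (uintptr_t)0x1F;   /* bytes past a 32B boundary */
    return (long)(((32 - misalign) / elem_size) & (32 / elem_size - 1));
}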
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "swap_microk_power10.c" #elif defined(POWER10) -#include "dswap_microk_power8.c" +#include "swap_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5016f67dd..f1ef00066 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { - BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 991d27508..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "sasum_microk_power10.c" #elif defined(POWER10) -#include "sasum_microk_power8.c" +#include "sasum_microk_power10.c" #endif #endif @@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 5a0d4b12e..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "srot_microk_power10.c" #elif defined(POWER10) -#include "srot_microk_power8.c" +#include "srot_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 9ae9ccab8..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
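As an illustration (not part of the patch): across sasum.c, srot.c, sscal.c and the other level-1 files touched here, the former "#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)" special case collapses to a plain "#elif defined(POWER10)" once the POWER10 micro-kernels handle both byte orders. A self-contained sketch of that dispatch pattern; KERNEL_IMPL and the generic fallback are illustrative names, not OpenBLAS identifiers.

#include <stdio.h>

#if defined(POWER8) || defined(POWER9)
#  define KERNEL_IMPL "power8"
#elif defined(POWER10)
#  define KERNEL_IMPL "power10"   /* selected on big and little endian alike */
#else
#  define KERNEL_IMPL "generic"
#endif

int main(void)
{
    printf("kernel: %s\n", KERNEL_IMPL);
    return 0;
}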
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "sscal_microk_power10.c" #elif defined(POWER10) -#include "sscal_microk_power8.c" +#include "sscal_microk_power10.c" #endif #endif @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; @@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 955ed02f0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "swap_microk_power10.c" #elif defined(POWER10) -#include "sswap_microk_power8.c" +#include "swap_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 64 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c index 5ca1603a6..246c3a236 100644 --- a/kernel/power/trsm_kernel_LN_power10.c +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[120] = (c0[15] *= a[255]); b[121] = (c1[15] *= a[255]); diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c index 14ff12fe4..51f3a4e61 100644 --- a/kernel/power/trsm_kernel_LT_power10.c +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[0] = (c0[0] *= a[0]); b[1] = (c1[0] *= a[0]); diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index 8e593bbfa..b03508b09 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, double alpha_r, double alpha_i) { #if !defined(CONJ) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + static const double mvec[2] = { -1.0, 1.0 }; +#else + static const double mvec[2] = { 1.0, -1.0 }; +#endif +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) static const double mvec[2] = { 1.0, -1.0 }; #else static const double mvec[2] = { -1.0, 1.0 }; +#endif #endif const double *mvecp = mvec; diff --git a/kernel/power/zgemm_kernel_power10.S 
b/kernel/power/zgemm_kernel_power10.S index fca389e69..afee8f183 100644 --- a/kernel/power/zgemm_kernel_power10.S +++ b/kernel/power/zgemm_kernel_power10.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r0, FLINK_SAVE(SP) -#if defined(linux) || defined(__FreeBSD__) +#if defined(linux) || defined(__FreeBSD__) || defined(_AIX) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S index 42f9c5ad4..e5e5ec0e6 100644 --- a/kernel/power/zgemm_macros_power10.S +++ b/kernel/power/zgemm_macros_power10.S @@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#else xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif #endif .endm /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#endif .endm /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#endif .endm /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ @@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1,\VSIN1,\VSIN2 + xxmrgld \VSOUT2,\VSIN1,\VSIN2 +#else xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 +#endif .endm @@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 #ifndef TRMMKERNEL lxv vs50, (\LOFFSET)(\BASE_REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd vs46,vs50,vs50 + xxmrgld vs47,vs50,vs50 +#else xxmrgld vs46,vs50,vs50 xxmrghd vs47,vs50,vs50 +#endif #endif RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 MULT_APLHA_PART1 vs34,vs36, vs46,vs47 MULT_APLHA_PART2 vs34,vs36, vs46,vs47 UNPACK_FOR_STORE vs46,vs47,vs39,vs41 +#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) xxmrghd vs39,vs47,vs46 +#endif stxv vs39, (\LOFFSET)(\BASE_REG) .endm @@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
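As an illustration (not part of the patch): the xxmrghd/xxmrgld swaps added to the zgemm macros above pack the first lanes of two double-complex partial results into one vector and the second lanes into another before the alpha multiply; on big-endian the merge-high/merge-low roles are simply exchanged. A portable-C picture of that pairing; dpair_t and pack_pairs are illustrative names.

typedef struct { double first, second; } dpair_t;   /* one 2-lane VSR, illustratively */

/* pack {a0,b0} and {a1,b1} into {a0,a1} and {b0,b1}, as the RESULT_INTO_*
   macros do for the real*real and imag*imag product pairs */
static void pack_pairs(dpair_t r0, dpair_t r1,
                       double out_first[2], double out_second[2])
{
    out_first[0]  = r0.first;   out_first[1]  = r1.first;
    out_second[0] = r0.second;  out_second[1] = r1.second;
}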
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs44, vs50 + xvf64gerpp 3, vs46, vs50 + xvf64gerpp 4, vs40, vs51 + xvf64gerpp 5, vs42, vs51 + xvf64gerpp 6, vs44, vs51 + xvf64gerpp 7, vs46, vs51 +#else xvf64gerpp 0, vs40, vs51 xvf64gerpp 1, vs42, vs51 xvf64gerpp 2, vs44, vs51 @@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs42, vs50 xvf64gerpp 6, vs44, vs50 xvf64gerpp 7, vs46, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP4(\Index,64) @@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x8 OffsetA,OffsetB +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif xxpermdi vs32, vs16, vs17, 0b01 xxpermdi vs33, vs16, vs17, 0b10 xxpermdi vs34, vs18, vs19, 0b01 @@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
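As an illustration (not part of the patch): each xvf64gerpp above performs a rank-one update of an accumulator tile (acc += a_column * b_row) on packed data; the big-endian arms only change which half of the paired B load feeds which update, not the arithmetic. A scalar sketch of a rank-one outer-product accumulate; ger_ref is an illustrative name.

/* rank-1 update: acc (m x n, row-major) += a_col * b_row */
static void ger_ref(int m, int n, const double *a_col, const double *b_row,
                    double *acc)
{
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            acc[i * n + j] += a_col[i] * b_row[j];
}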
xxpermdi vs45, vs28, vs29, 0b10 xxpermdi vs46, vs30, vs31, 0b01 xxpermdi vs47, vs30, vs31, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs16, vs32, vs32 + xxlor vs17, vs33, vs33 + xxlor vs18, vs34, vs34 + xxlor vs19, vs35, vs35 + xxlor vs20, vs36, vs36 + xxlor vs21, vs37, vs37 + xxlor vs22, vs38, vs38 + xxlor vs23, vs39, vs39 + xxlor vs24, vs40, vs40 + xxlor vs25, vs41, vs41 + xxlor vs26, vs42, vs42 + xxlor vs27, vs43, vs43 + xxlor vs28, vs44, vs44 + xxlor vs29, vs45, vs45 + xxlor vs30, vs46, vs46 + xxlor vs31, vs47, vs47 +#else xxlor vs18, vs32, vs32 xxlor vs19, vs33, vs33 xxlor vs16, vs34, vs34 @@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs31, vs45, vs45 xxlor vs28, vs46, vs46 xxlor vs29, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 addi CO, CO, 128 @@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs42, vs51 - xvf64gerpp 2, vs40, vs50 - xvf64gerpp 3, vs42, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs40, vs51 + xvf64gerpp 3, vs42, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP4(\Index,64) @@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x4 OffsetA, OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 addi CO, CO, 64 @@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_2 Index, IsLast lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs40, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs40, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP4(\Index,64) @@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x2 OffsetA,OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 SAVE2 vs4,vs5,vs6,vs7,T1,0 addi CO, CO, 32 @@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs36, vs49 - xvf64gerpp 3, vs38, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 - xvf64gerpp 2, vs44, vs48 - xvf64gerpp 3, vs46, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 + xvf64gerpp 2, vs44, vs49 + xvf64gerpp 3, vs46, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP2(\Index,32) @@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 addi CO, CO, 128 .endm @@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP2(\Index,32) @@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 addi CO, CO, 64 .endm @@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 0, vs40, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 0, vs40, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP2(\Index,32) @@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs33, vs0, vs1, 0b10 xxpermdi vs34, vs2, vs3, 0b01 xxpermdi vs35, vs2, vs3, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 xxlor vs1, vs35, vs35 +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 addi CO, CO, 32 diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 1f7199c89..366c21681 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c index f5bb8d70e..a545b00d8 100644 --- a/kernel/power/zgemv_n_power10.c +++ b/kernel/power/zgemv_n_power10.c @@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index d3bf60ca7..e42eafaba 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif HAVE_KERNEL_4x4_VEC -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 59ddc149f..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) #include "zscal_microk_power8.c" #endif -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#elif defined(POWER10) #if defined(DOUBLE) #include "zscal_microk_power10.c" #else #include "cscal_microk_power10.c" #endif -#elif defined(POWER10) -#if defined(DOUBLE) -#include "zscal_microk_power8.c" -#endif #endif #endif diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c index 15b8323f4..af99b8648 100644 --- a/kernel/power/zscal_microk_power10.c +++ b/kernel/power/zscal_microk_power10.c @@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xsnegdp 33, %x10 \n\t" // -alpha_i XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i +#else XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i +#endif "lxvp 40, 0(%2) \n\t" "lxvp 42, 32(%2) \n\t" @@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 49, 49, 39 \n\t" "xvadddp 50, 50, %x3 \n\t" "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else "stxv 49, 0(%2) \n\t" "stxv 48, 16(%2) \n\t" "stxv 51, 32(%2) \n\t" "stxv 50, 48(%2) \n\t" +#endif "xvadddp 34, 34, %x5 \n\t" @@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 36, 36, %x7 \n\t" "xvadddp 37, 37, %x8 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else "stxv 35, 64(%2) \n\t" "stxv 34, 80(%2) \n\t" "stxv 37, 96(%2) \n\t" "stxv 36, 112(%2) \n\t" - +#endif "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" @@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 50, 50, %x3 \n\t" "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else "stxv 49, 0(%2) \n\t" "stxv 48, 16(%2) \n\t" "stxv 51, 32(%2) \n\t" "stxv 50, 48(%2) \n\t" - +#endif "xvadddp 34, 34, %x5 \n\t" "xvadddp 35, 35, %x6 \n\t" "xvadddp 36, 36, %x7 \n\t" "xvadddp 37, 37, %x8 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else "stxv 35, 64(%2) \n\t" "stxv 34, 80(%2) \n\t" "stxv 37, 96(%2) \n\t" "stxv 36, 112(%2) \n\t" - +#endif "#n=%1 x=%0=%2 alpha=(%9,%10) \n" : "+m" (*x), diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 908802b71..fe7871852 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#elif defined(POWER10) #include "cswap_microk_power10.c" -#elif defined(POWER10) -#include "zswap_microk_power8.c" #endif #endif diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 6272dd73d..dba94aea8 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c + +SBGEMM_BETA = sgemm_beta_skylakex.c +SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c +SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c +SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c +SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h index 78db7abb2..cdb4beff6 100644 --- a/kernel/x86_64/bf16_common_macros.h +++ b/kernel/x86_64/bf16_common_macros.h @@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = 
_mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ @@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ - reg = _mm256_loadu_si256(x + idx_n); + reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ - reg = _mm_loadu_si128(x + idx_n); + reg = _mm_loadu_si128((__m128i *)(x + idx_n)); #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d51929f9f..b398aa6e1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1; + __m128 abs_mask1 = abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index aea8c02d9..83bc078b3 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ 
-58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index c8c69d1e0..fbc91b558 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 067726cb1..2aefe46ff 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG 
n, bfloat16 *x, bfloat16 *y) __m256 accum256_1 = _mm256_setzero_ps(); int tail_index_32 = n&(~31); for (int j = 0; j < tail_index_32; j += 32) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); - accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); } accum256 = _mm256_add_ps(accum256, accum256_1); /* Processing the remaining <32 chunk with 16-elements processing */ if ((n&16) != 0) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); } accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } else if (n > 15) { /* n range from 16 to 31 */ /* Processing <32 chunk with 16-elements processing */ __m256 accum256 = _mm256_setzero_ps(); - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } } else if (n > 7) { /* n range from 8 to 15 */ /* Processing <16 chunk with 8-elements processing */ - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); /* Processing the remaining <8 chunk with masked 8-elements processing */ if ((n&7) != 0) { diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 2c27221ac..b8c41f4f7 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1246,7 +1246,7 
@@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat // K=Any number but will be processed based on 32, M<=16 void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * src_addr0; bfloat16 * dst_addr0, * dst_addr1; BLASLONG tag_k_32x = k & (~31); diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c new file mode 100644 index 000000000..7af51b6d8 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -0,0 +1,499 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) +#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) +#define BROADCAST64(base, step, n, offset, zmm) \ + if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + +#define DECLARE_A_PAIR(A) \ + __m512i A_lo_##A; __m512i A_hi_##A; + +#define LOAD_A_PAIR(A) \ + VMOVLDUP(ptr_a##A, A_lo_##A); \ + VMOVHDUP(ptr_a##A, A_hi_##A); + +#define MASK_LOAD_A_PAIR(A) { \ + __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ +} + +#define LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define MASK_LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define DECLARE_B_PAIR() \ + __m512i B_lo; __m512i B_hi; + +#define PREFETCH_B_STEP 32 +#define PREFETCH_B(Bx, By) \ + if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ + else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) + +#define BROADCAST_B_PAIR(Bx, By) \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); + +#define MASK_BROADCAST_B_PAIR(Bx, x) {\ + __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ + B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ + B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define DECLARE_RESULT_4X(A, Bx, By) \ + __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); + +#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) + +#define MATMUL_4X(A, Bx, By) \ + FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ + FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ + FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ + FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); + +#define _STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ + asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) + +#define
_MASK_STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + +#define _REORDER_C_2X(result_0, result_1) { \ + __m512 tmp0, tmp1; \ + tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ + tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ + result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ + result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ +} + +#define _STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define STORE_4X(A, Bx, By) { \ + _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define MASK_STORE_4X(A, Bx, By) { \ + _MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define _STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); + +#define _MASK_STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + +#define N_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + +#define N_MASK_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c0, *ptr_c1; + BLASLONG n_count = n; + BLASLONG m_count, k_count; + BLASLONG n_blksize = 4 * k; + BLASLONG cn_offset = 0; + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + + for (; n_count > 23; n_count -= 24) { + IFLOAT *ptr_b00 = ptr_b; + IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); 
DECLARE_RESULT_4X(0, 1, 2); + k_count = k; + for (; k_count > 3; k_count -=4) { + LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + + LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + for (; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); + ptr_c += m_count; + } + ptr_b += 24 * k; + cn_offset += 24; + } + for (; n_count > 11; n_count -= 12) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 31; m_count -= 32) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); 
DECLARE_A_PAIR(1); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); LOAD_A_PAIR(1); + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + ptr_a0 += 16; + ptr_a1 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + ptr_c1 = ptr_c + 16; + STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); + STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); + STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); + ptr_c += 16 * 2; + ptr_a0 = ptr_a1; + ptr_a1 = ptr_a0 + 16 * k; + } + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + ptr_c += m_count; + } + ptr_b += 12 * k; + cn_offset += 12; + } + for (; n_count > 3; n_count -= 4) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + 
MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + ptr_b += 4 * k; + cn_offset += 4; + } + if (n_count > 0) { + __mmask8 nmask = (1UL << n_count) - 1; + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + N_STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + N_MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c new file mode 100644 index 000000000..95ed82d7c --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -0,0 +1,353 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#include "common.h" + +#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ + asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) + +#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ + __m512i v; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + t4 = _mm512_unpacklo_epi32(r4, r5); \ + t5 = _mm512_unpackhi_epi32(r4, r5); \ + t6 = _mm512_unpacklo_epi32(r6, r7); \ + t7 = _mm512_unpackhi_epi32(r6, r7); \ + _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_epi32(kc, t0, v); \ + r1 = _mm512_mask_blend_epi32(k3, t2, v); \ + _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_epi32(kc, t1, v); \ + r3 = _mm512_mask_blend_epi32(k3, t3, v); \ + _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_epi32(kc, t4, v); \ + r5 = _mm512_mask_blend_epi32(k3, t6, v); \ + _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_epi32(kc, t5, v); \ + r7 = _mm512_mask_blend_epi32(k3, t7, v); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ +} + +#define STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_storeu_si512(boffset0 + x*32, v); + +#define STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_storeu_si512(boffset0 + (x + 8)*32, v); + +#define MASK_STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v); + +#define MASK_STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); + +#define STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { STORE_512_LO(y); } \ + else { STORE_512_HI(y); } \ +} + +#define MASK_STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { MASK_STORE_512_LO(y); } \ + else { MASK_STORE_512_HI(y); } \ +} + +#define SET_TAIL(y, x) {\ + if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ +} + +#define GET_TAIL() \ + switch (n_store + 1) { \ + case 16: SET_TAIL(1, 7); break; \ + case 15: SET_TAIL(1, 6); break; \ + case 14: SET_TAIL(1, 5); break; \ + case 13: SET_TAIL(1, 4); break; \ + case 12: SET_TAIL(1, 3); break; \ + case 11: SET_TAIL(1, 2); break; \ + case 10: SET_TAIL(1, 1); break; \ + case 9: SET_TAIL(1, 0); break; \ + case 8: SET_TAIL(0, 7); break; \ + case 7: SET_TAIL(0, 6); break; \ + case 6: SET_TAIL(0, 5); break; \ + case 5: SET_TAIL(0, 4); break; \ + case 4: SET_TAIL(0, 3); break; \ + case 3: SET_TAIL(0, 2); break; \ + case 2: SET_TAIL(0, 1); break; \ + case 1: SET_TAIL(0, 0); break; \ + } + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0; + IFLOAT *aoffset; + IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; + IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, 
*aoffset17; + aoffset = a; + boffset0 = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + u_int64_t permute_table2[] = { + 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, + 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); + __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t00, t01, t02, t03, t04, t05, t06, t07; + __m512i t10, t11, t12, t13, t14, t15, t16, t17; + + for (j = 0; j < n16; j += 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + r0 = _mm512_loadu_si512(aoffset00 + i); + r1 = _mm512_loadu_si512(aoffset01 + i); + r2 = _mm512_loadu_si512(aoffset02 + i); + r3 = _mm512_loadu_si512(aoffset03 + i); + r4 = _mm512_loadu_si512(aoffset04 + i); + r5 = _mm512_loadu_si512(aoffset05 + i); + r6 = _mm512_loadu_si512(aoffset06 + i); + r7 = _mm512_loadu_si512(aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_loadu_si512(aoffset10 + i); + r1 = _mm512_loadu_si512(aoffset11 + i); + r2 = _mm512_loadu_si512(aoffset12 + i); + r3 = _mm512_loadu_si512(aoffset13 + i); + r4 = _mm512_loadu_si512(aoffset14 + i); + r5 = _mm512_loadu_si512(aoffset15 + i); + r6 = _mm512_loadu_si512(aoffset16 + i); + r7 = _mm512_loadu_si512(aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); + STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); + STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); + STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); + boffset0 += 16 * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, 
aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: STORE_512(1, 6); + case 14: STORE_512(1, 5); + case 13: STORE_512(1, 4); + case 12: STORE_512(1, 3); + case 11: STORE_512(1, 2); + case 10: STORE_512(1, 1); + case 9: STORE_512(1, 0); + case 8: STORE_512(0, 7); + case 7: STORE_512(0, 6); + case 6: STORE_512(0, 5); + case 5: STORE_512(0, 4); + case 4: STORE_512(0, 3); + case 3: STORE_512(0, 2); + case 2: STORE_512(0, 1); + case 1: STORE_512(0, 0); + } + boffset0 += n_store * 32; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); + boffset0 += 16; + } + } + + } + if (j < n) { + int remain_n = n - j; + __mmask16 nmask = (1UL << remain_n) - 1; + int load0, load1; + if (remain_n > 8) { + load0 = 8; + load1 = remain_n - 8; + } else { + load0 = remain_n; + load1 = 0; + } + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + switch (load0) { + case 8: r7 = _mm512_loadu_si512(aoffset07 + i); + case 7: r6 = _mm512_loadu_si512(aoffset06 + i); + case 6: r5 = _mm512_loadu_si512(aoffset05 + i); + case 5: r4 = _mm512_loadu_si512(aoffset04 + i); + case 4: r3 = _mm512_loadu_si512(aoffset03 + i); + case 3: r2 = _mm512_loadu_si512(aoffset02 + i); + case 2: r1 = _mm512_loadu_si512(aoffset01 + i); + case 1: r0 = _mm512_loadu_si512(aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_loadu_si512(aoffset17 + i); + case 7: r6 = _mm512_loadu_si512(aoffset16 + i); + case 6: r5 = _mm512_loadu_si512(aoffset15 + i); + case 5: r4 = _mm512_loadu_si512(aoffset14 + i); + case 4: r3 = _mm512_loadu_si512(aoffset13 + i); + case 3: r2 = _mm512_loadu_si512(aoffset12 + i); + case 2: r1 = _mm512_loadu_si512(aoffset11 + i); + case 1: r0 = _mm512_loadu_si512(aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); + MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); + MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); + boffset0 += remain_n * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + switch (load0) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, 
aoffset17 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: MASK_STORE_512(1, 6); + case 14: MASK_STORE_512(1, 5); + case 13: MASK_STORE_512(1, 4); + case 12: MASK_STORE_512(1, 3); + case 11: MASK_STORE_512(1, 2); + case 10: MASK_STORE_512(1, 1); + case 9: MASK_STORE_512(1, 0); + case 8: MASK_STORE_512(0, 7); + case 7: MASK_STORE_512(0, 6); + case 6: MASK_STORE_512(0, 5); + case 5: MASK_STORE_512(0, 4); + case 4: MASK_STORE_512(0, 3); + case 3: MASK_STORE_512(0, 2); + case 2: MASK_STORE_512(0, 1); + case 1: MASK_STORE_512(0, 0); + } + boffset0 += n_store * remain_n * 2; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c new file mode 100644 index 000000000..eefbd7355 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#include "common.h" + +#define REORDER_4x32(r0, r1, r2, r3) {\ + __m512i t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + r0 = _mm512_unpacklo_epi64(t0, t2); \ + r1 = _mm512_unpackhi_epi64(t0, t2); \ + r2 = _mm512_unpacklo_epi64(t1, t3); \ + r3 = _mm512_unpackhi_epi64(t1, t3); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ + t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ + t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ + r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ + r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ + r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ + r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) {\ + __m128i t0, t1, t2, t3; \ + t0 = _mm_unpacklo_epi32(r0, r1); \ + t1 = _mm_unpackhi_epi32(r0, r1); \ + t2 = _mm_unpacklo_epi32(r2, r3); \ + t3 = _mm_unpackhi_epi32(r2, r3); \ + r0 = _mm_unpacklo_epi64(t0, t2); \ + r1 = _mm_unpackhi_epi64(t0, t2); \ + r2 = _mm_unpacklo_epi64(t1, t3); \ + r3 = _mm_unpackhi_epi64(t1, t3); \ +} + +#define GET_TAIL(tail, remain_m) \ + switch((remain_m + 1)/2) { \ + case 1: tail = r0; break; \ + case 2: tail = r1; break; \ + case 3: tail = r2; break; \ + case 4: tail = r3; break; \ + } + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *aoffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + + IFLOAT *boffset; + + aoffset = a; + boffset = b; + + BLASLONG m32 = m & ~31; + BLASLONG m8 = m & ~7; + BLASLONG n4 = n & ~3; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + + for (j = 0; j < n4; j += 4) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset += 4 * lda; + + for (i = 0; i < m32; i += 32) { + __m512i r0, r1, r2, r3; + r0 = _mm512_loadu_si512(aoffset0 + i); + r1 = _mm512_loadu_si512(aoffset1 + i); + r2 = _mm512_loadu_si512(aoffset2 + i); + r3 = _mm512_loadu_si512(aoffset3 + i); + REORDER_4x32(r0, r1, r2, r3); + _mm512_storeu_si512(boffset + 32*0, r0); + _mm512_storeu_si512(boffset + 32*1, r1); + _mm512_storeu_si512(boffset + 32*2, r2); + _mm512_storeu_si512(boffset + 32*3, r3); + boffset += 32 * 4; + } + for (; i < m8; i += 8) { + __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); + REORDER_4x8(r0, r1, r2, r3); + _mm_storeu_si128((void *)(boffset + 8*0), r0); + _mm_storeu_si128((void *)(boffset + 8*1), r1); + _mm_storeu_si128((void *)(boffset + 8*2), r2); + _mm_storeu_si128((void *)(boffset + 8*3), 
r3); + boffset += 8 * 4; + } + if (i < m) { + int remain_m = m - i; + __mmask8 r_mask = (1UL << remain_m) - 1; + __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); + __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); + __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); + __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + + // store should skip the tail odd line + int num_store = remain_m/2; + switch(num_store) { + case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); + case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); + case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); + } + boffset += 8 * num_store; + + if (m & 0x1) { // handling the tail + __m128i tail; + GET_TAIL(tail, remain_m); + /* tail vector is fill with zero like: + * a, 0, b, 0, c, 0, d, 0 + * need to extract lo words of data and store + */ + tail = _mm_cvtepi32_epi16(tail); + _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid + boffset += 4; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask8 nmask = (1UL << remain_n) - 1; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + __m128i r0, r1, r2, r3; + for (i = 0; i < m8; i += 8) { + switch (remain_n) { + case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + } + REORDER_4x8(r0, r1, r2, r3); + _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); + boffset += 8 * remain_n; + } + if (i < m) { + int remain_m = m - i; + __mmask8 mmask = (1UL << remain_m) - 1; + switch (remain_n) { + case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); + case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); + case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + + int num_store = remain_m/2; + switch (num_store) { + case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + } + boffset += 2 * num_store * remain_n; + + if (m & 0x1) { + __m128i tail; + GET_TAIL(tail, remain_m); + tail = _mm_cvtepi32_epi16(tail); + _mm_mask_storeu_epi16(boffset, nmask, tail); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c index 823aafbdd..70becd9fa 100644 --- a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { - return 1; + double MNK = (double) M * (double) N * (double) K; + if (MNK > 256.0*256.0*256.0) // disable for big size matrix + return 0; + /* small matrix kernel works well for N = 8, 16, 32 */ + if (N == 8 || N == 16 || N == 32) + return 1; + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c new file mode 100644 index 000000000..88725f343 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table[] = { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... 
+ */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_storeu_si256((void *)boffset1, a1); + boffset0 += 16; + boffset1 += 16; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c new file mode 100644 index 000000000..e9edd4571 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -0,0 
+1,216 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define STORE_VEC(Bx, By, vec) \ + if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ + else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n24 = n - (n % 24); + BLASLONG n8 = n & ~7; + BLASLONG m8 = m & ~7; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + j = 0; + if (n > 23) { + /* n = 24 is the max width in current blocking setting */ + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + __mmask32 mask24 = (1UL << 24) - 1; + BLASLONG blk_size = m * 4; + BLASLONG stride = blk_size * 3; + + for (; j < n24; j += 24) { + boffset1 = boffset0 + stride; + for (i = 0; i < m8; i += 8) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); + r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); + r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); + r5 = 
_mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); + r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); + r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); + + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + t2 = _mm512_unpacklo_epi16(r2, r3); + t3 = _mm512_unpackhi_epi16(r2, r3); + t4 = _mm512_unpacklo_epi16(r4, r5); + t5 = _mm512_unpackhi_epi16(r4, r5); + t6 = _mm512_unpacklo_epi16(r6, r7); + t7 = _mm512_unpackhi_epi16(r6, r7); + + r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); + r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); + r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); + r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); + r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); + r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); + r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); + r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); + + t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); + t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); + t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); + t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); + t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); + t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); + + STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); + STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m2; i += 2) { + __m512i r0, r1, t0, t1; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); + STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0)); + STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); + STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); + STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); + STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; + *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; + *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; + *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; + *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; + *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; + boffset0 += 4; + boffset1 += 4; + } + boffset0 += stride * 2; + } + } + + for (; j < n8; j += 8) { + boffset1 = boffset0 + m * 4; + for (i = 0; i < m4; i += 4) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); + __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + __m128i a10 = _mm_unpacklo_epi16(a2, a3); + __m128i a11 = _mm_unpackhi_epi16(a2, a3); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset0 + 8), a10); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + _mm_storeu_si128((void *)(boffset1 + 8), a11); + boffset0 += 16; + boffset1 += 16; + } + for (; i < m2; i+= 2) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = 
_mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, a0); + _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); + boffset0 += 4; + boffset1 += 4; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask8 r_mask = (1UL << remains) - 1; + if (remains > 4) { + boffset1 = boffset0 + m * 4; + uint32_t tail1 = remains - 4; + __mmask8 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)boffset0, a00); + _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); + boffset0 += 8; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, (__m128d) a0); + _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); + boffset0 += 4; + boffset1 += tail1; + } + } else { + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 46e6d0ff9..4711e9720 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
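[Editor's note] The AVX-512 copy kernel added above packs each 24-column stripe of the bfloat16 input into six contiguous sub-panels of four columns each (blk_size = m*4; boffset0 covers the first three sub-panels, boffset1 the next three). The unpack/permutex2var sequence is a vectorized form of the scalar layout below, which mirrors the kernel's own scalar tail loop. This is an illustrative sketch only; the function and type names (pack_stripe24, bf16_t) are the editor's, not part of the patch.

#include <stdint.h>

typedef uint16_t bf16_t;   /* stand-in for OpenBLAS IFLOAT / bfloat16 storage */

/* Scalar reference for one 24-column stripe starting at column j.
 * Output layout: six sub-panels of size m x 4 stored back to back;
 * within a sub-panel each row contributes 4 consecutive bf16 values. */
static void pack_stripe24(long m, long lda, const bf16_t *a, long j, bf16_t *b)
{
    for (int blk = 0; blk < 6; blk++)            /* 6 sub-panels of width 4 */
        for (long i = 0; i < m; i++)             /* all rows of the stripe  */
            for (int c = 0; c < 4; c++)          /* 4 columns per sub-panel */
                b[blk * m * 4 + i * 4 + c] = a[i * lda + j + blk * 4 + c];
}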
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); accum512_0 = _mm512_setzero_ps(); diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 51e681add..8a3a022fb 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char load_mask_value = (((unsigned char)0xff) >> 6); @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num == 8) { __m256 result256 = _mm256_setzero_ps(); - __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 __m256i xArray256 = _mm512_castsi512_si256(xArray); result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); result256_1 = _mm256_setzero_ps(); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element - matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 
element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, if (tail_num > 10) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num > 5) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows matrixArray256_2 = _mm256_setzero_si256(); @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512 result_0, result_1; @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element - matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element - matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + matrixArray_0 = 
_mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element // Process the 0|1 elements // Select the 0|1 elements for each row @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, { BLASLONG tag_m_16x = m & (~15); - __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m128 result128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { result128 = _mm_setzero_ps(); - matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); tmp128 = _mm_shuffle_ps(result128, result128, 14); result128 = _mm_add_ps(result128, tmp128); @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| if (tag_m_14x > 0) { @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI16_2 = _mm256_set1_epi16(2); @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| if (tag_m_12x > 0) { @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI32_1 = _mm256_set1_epi32(1); @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = 
(((unsigned char)0xff) >> 5); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x { BLASLONG tag_m_16x = m & (~15); - __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + 
__m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m128 accum128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { accum256 = _mm256_setzero_ps(); - matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2647,8 +2680,6 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ - accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; __m512 accum512_bridge[8]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif #endif __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m128 accum128, tmp128; for (BLASLONG i = tag_m_8x; i < m; i++) { accum256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) 
xArray256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 1c29c1168..6217acf48 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -41,7 +41,7 @@ #include int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 90865c4b3..e0778006f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT #endif +#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL + #ifndef HAVE_KERNEL_4x2 static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -170,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#endif #endif #ifndef HAVE_KERNEL_4x1 diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index b44c53801..e257a5456 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1; + __m128d abs_mask1 = abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f index a45f55ac3..26a9febc8 100644 --- a/lapack-netlib/SRC/clarrv.f +++ b/lapack-netlib/SRC/clarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0) .OR. (M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index 4a59a2bbf..a1c6e9c9d 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f index 04519fde8..9448b2fd9 100644 --- a/lapack-netlib/SRC/slarrv.f +++ b/lapack-netlib/SRC/slarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 23976dbef..8d10e3c2e 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/param.h b/param.h index 07397a66e..038233c19 100644 --- a/param.h +++ b/param.h @@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define USE_SGEMM_KERNEL_DIRECT 1 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_P 384 +#define SBGEMM_DEFAULT_Q 768 +#define SBGEMM_DEFAULT_R sbgemm_r + #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2454,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
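[Editor's note] Two recurring fixes in the sbgemv template hunks above deserve a word. First, the added (__m256i *)/(__m128i *) casts are needed because _mm256_loadu_si256 and _mm_loadu_si128 take a pointer to __m256i/__m128i, not a bfloat16 pointer. Second, the BETAVECTOR declaration is now wrapped in an extra #ifndef ONE_BETA because the vector is only referenced when beta is neither 0 nor 1; declaring it unconditionally leaves an unused variable in the beta==1 instantiation, which is presumably the warning these hunks silence. The following is a minimal sketch of that three-way store pattern, written by the editor; it is not the template's actual STORE*_COMPLETE_RESULT macros from bf16_common_macros.h.

#include <immintrin.h>

/* Sketch of the beta handling selected by the ZERO_BETA / ONE_BETA guards.
 * Build with none, one, or both of -DONE_BETA / -DZERO_BETA to see why
 * BETAVECTOR only needs to exist in the "general beta" case.            */
static void store_result(float *y, __m512 acc, float beta)
{
    (void)beta;                        /* only read in the general-beta branch */
#ifndef ZERO_BETA                      /* beta != 0: the old y is read         */
#ifndef ONE_BETA                       /* beta != 1: y must be scaled          */
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
    _mm512_storeu_ps(y, _mm512_fmadd_ps(BETAVECTOR, _mm512_loadu_ps(y), acc));
#else                                  /* beta == 1: plain accumulate          */
    _mm512_storeu_ps(y, _mm512_add_ps(_mm512_loadu_ps(y), acc));
#endif
#else                                  /* beta == 0: overwrite, no BETAVECTOR  */
    _mm512_storeu_ps(y, acc);
#endif
}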
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
 
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define DGEMM_DEFAULT_UNROLL_M 16
-#define DGEMM_DEFAULT_UNROLL_N 4
-#else
 #define DGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_N 8
-#endif
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8
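[Editor's note] Both the new copy kernel and the sbgemv kernels above handle ragged edges with the same idiom: build a bit mask as (1 << remaining) - 1 (see mask24, r_mask, tail_mask and the *_tail_mask_value computations) and use masked AVX-512 loads and stores so no out-of-bounds memory is touched. A self-contained sketch of the idiom follows; the function name copy_bf16_tail is the editor's, chosen for illustration, and it requires AVX512BW.

#include <immintrin.h>
#include <stdint.h>

/* Copy 'remains' (strictly less than 32) bf16 values using tail masking;
 * full 32-element groups are handled by the unmasked path in the kernels. */
static void copy_bf16_tail(const uint16_t *src, uint16_t *dst, unsigned remains)
{
    __mmask32 tail = (__mmask32)((1u << remains) - 1);   /* low 'remains' bits set     */
    __m512i v = _mm512_maskz_loadu_epi16(tail, src);     /* unselected lanes read as 0 */
    _mm512_mask_storeu_epi16(dst, tail, v);              /* only selected lanes stored */
}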