Merge branch 'xianyi:develop' into azure-mingw-make
commit f54fa15cdd

.travis.yml | 27
@@ -1,33 +1,38 @@
# XXX: Precise is already deprecated, new default is Trusty.
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
-dist: precise
+dist: focal
sudo: true
language: c

matrix:
  include:
    - &test-ubuntu
-      os: linux
+#      os: linux
      compiler: gcc
      addons:
        apt:
          packages:
            - gfortran
+#      before_script: &common-before
+#        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
+#      script:
+#        - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
+#        - make -C test $COMMON_FLAGS $BTYPE
+#        - make -C ctest $COMMON_FLAGS $BTYPE
+#        - make -C utest $COMMON_FLAGS $BTYPE
+#      env:
+#        - TARGET_BOX=LINUX64
+#        - BTYPE="BINARY=64"
+#
+#    - <<: *test-ubuntu
+      os: linux-ppc64le
      before_script: &common-before
-        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
      script:
        - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
        - make -C test $COMMON_FLAGS $BTYPE
        - make -C ctest $COMMON_FLAGS $BTYPE
        - make -C utest $COMMON_FLAGS $BTYPE
      env:
        - TARGET_BOX=LINUX64
        - BTYPE="BINARY=64"

-    - <<: *test-ubuntu
-      os: linux-ppc64le
-      before_script:
-        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
      env:
        # for matrix annotation only
        - TARGET_BOX=PPC64LE_LINUX
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 17.dev)
+set(OpenBLAS_PATCH_VERSION 18.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

# Adhere to GNU filesystem layout conventions

@@ -132,7 +132,7 @@ endif ()

if (BUILD_BFLOAT16)
  message(STATUS "Building Half Precision")
-  list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
+  # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
endif ()

if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
@@ -1,4 +1,47 @@
OpenBLAS ChangeLog
====================================================================
+Version 0.3.18
+02-Oct-2021
+
+general:
+- when the build-time number of preconfigured threads is exceeded
+  at runtime (typically by an external program calling BLAS functions
+  from a larger number of threads in parallel), OpenBLAS will now
+  allocate an auxiliary control structure for up to 512 additional
+  threads instead of aborting
+- added support for Loongson's LoongArch64 cpu architecture
+- fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON
+- added support for building OpenBLAS as a CMAKE subproject
+- added support for building for Windows/ARM64 targets with clang
+- improved support for building with the IBM xlf compiler
+- imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV)
+- imported Reference-LAPACK PR 597 for testsuite compatibility with
+  LLVM's libomp
+
+x86_64:
+- added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000)
+- added optimized SBGEMM for Intel Cooper Lake
+- reinstated the performance patch for AVX512 SGEMV_T with a proper fix
+- added a workaround for a gcc11 tree-vectorizer bug that caused spurious
+  failures in the test programs for complex BLAS3 when compiling at -O3
+  (the default for cmake "release" builds)
+- added support for runtime cpu count detection under Haiku OS
+- worked around a long-standing miscompilation issue of the Haswell DGEMV_T
+  kernel with gcc that could produce NaN output in some corner cases
+
+POWER:
+- improved performance of DASUM on POWER10
+
+ARMV8:
+- fixed crashes (use of reserved register x18) on Apple M1 under OSX
+- fixed building with gcc releases earlier than 5.1
+
+MIPS:
+- fixed building under BSD
+
+MIPS64:
+- fixed building under BSD
+
+====================================================================
Version 0.3.17
15-Jul-2021
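The thread-limit change under "general" above is easiest to picture with a small driver. The sketch below is illustrative and not part of the commit: it assumes a build of this tree linked as libopenblas with the CBLAS header available, and the CALLERS count, matrix size and expected value are made up for the demonstration.

```c
/* Hypothetical stress test: more concurrent BLAS callers than the
   build-time NUM_THREADS. With 0.3.18 the surplus callers should be
   served by the new auxiliary control structure instead of aborting.
   Assumed build line: gcc demo.c -lopenblas -lpthread */
#include <stdio.h>
#include <pthread.h>
#include <cblas.h>

#define N 64
#define CALLERS 128   /* pick something larger than the built-in NUM_THREADS */

static double A[N * N], B[N * N], C_[CALLERS][N * N];

static void *worker(void *arg)
{
    long id = (long)arg;
    /* C = 1.0*A*B + 0.0*C, row-major, no transposition */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                N, N, N, 1.0, A, N, B, N, 0.0, C_[id], N);
    return NULL;
}

int main(void)
{
    pthread_t tid[CALLERS];
    for (int i = 0; i < N * N; i++) { A[i] = 1.0; B[i] = 2.0; }
    for (long i = 0; i < CALLERS; i++)
        pthread_create(&tid[i], NULL, worker, (void *)i);
    for (int i = 0; i < CALLERS; i++)
        pthread_join(tid[i], NULL);
    printf("C[0][0] = %g (expect %g)\n", C_[0][0], 2.0 * N);
    return 0;
}
```

With earlier releases the same program could abort once more than NUM_THREADS callers entered the library at once; with this change it should run to completion.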
Makefile | 2
@@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild
lapack_prebuild :
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
	-@echo "FC          = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "FFLAGS      = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
	-@echo "FFLAGS_DRV  = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
	-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -12,9 +12,13 @@ endif
ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -O2 -qrecur -qnosave
+else
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
+endif
endif
endif

ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)

@@ -33,7 +37,11 @@ else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -O2 -qrecur -qnosave
+else
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
+endif
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)

@@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
ifeq ($(OSNAME), AIX)
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -O2 -qrecur -qnosave
+else
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
+endif
else
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif
@@ -3,7 +3,7 @@
#

# This library's version
-VERSION = 0.3.17.dev
+VERSION = 0.3.18.dev

# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -16,6 +16,8 @@ else
HOSTARCH = $(ARCH)
endif

+HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null)
+
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64

@@ -33,6 +35,10 @@ else ifeq ($(ARCH), armv7)
override ARCH=arm
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
+else ifeq ($(ARCH), mipsel)
+override ARCH=mips
+else ifeq ($(ARCH), mips64el)
+override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif

@@ -303,7 +309,7 @@ else
SMP = 1
endif
else
-ifeq ($(NUM_THREAD), 1)
+ifeq ($(NUM_THREADS), 1)
SMP =
else
SMP = 1
@@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
+- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.

@@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th

- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
+- **Cortex-A55**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 (different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)

@@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th

#### RISC-V

-- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
+- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
  ```sh
  make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
  ```
+  (also known to work on C906)

### Support for multiple targets in a single library
@@ -19,7 +19,7 @@ jobs:
# of gcc / glibc
- job: manylinux1_gcc
  pool:
-    vmImage: 'ubuntu-16.04'
+    vmImage: 'ubuntu-latest'
  steps:
  - script: |
      echo "FROM quay.io/pypa/manylinux1_x86_64

@@ -35,7 +35,7 @@ jobs:
    displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
  pool:
-    vmImage: 'ubuntu-16.04'
+    vmImage: 'ubuntu-latest'
  steps:
  - script: |
      # at the time of writing the available Azure Ubuntu vm image

@@ -213,8 +213,9 @@ jobs:
    vmImage: 'ubuntu-latest'
  steps:
  - script: |
-      wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
-        && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251  alpine-chroot-install' | sha1sum -c || exit 1
+      wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
+        && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74  alpine-chroot-install' | sha1sum -c \
+        || exit 1
      alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
      sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
      alpine make DYNAMIC_ARCH=1 BINARY=64
@@ -104,7 +104,7 @@ endif ()

if (${F_COMPILER} STREQUAL "IBM")
  set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM")
-  # FCOMMON_OPT += -qarch=440
+  set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur")
  if (BINARY64)
    set(FCOMMON_OPT "${FCOMMON_OPT} -q64")
    if (INTERFACE64)
@@ -134,6 +134,8 @@ if (BUILD_BFLOAT16)
    set(SHSWAPKERNEL ../arm/swap.c)
    set(TOBF16KERNEL ../x86_64/tobf16.c)
    set(BF16TOKERNEL ../x86_64/bf16to.c)
+    set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
+    set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
  endif ()
endmacro ()
@@ -469,6 +469,9 @@ endif()
if (BUILD_COMPLEX16)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
endif()
+if (BUILD_BFLOAT16)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16")
+endif()
if(NOT MSVC)
  set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
endif()
@@ -26,10 +26,12 @@
*****************************************************************************/

#include <string.h>
-#ifdef OS_DARWIN
+#ifdef __APPLE__
#include <sys/sysctl.h>
int32_t value;
size_t length=sizeof(value);
+int64_t value64;
+size_t length64=sizeof(value64);
#endif

#define CPU_UNKNOWN 0

@@ -212,9 +214,9 @@ int detect(void)

}
#else
-#ifdef DARWIN
+#ifdef __APPLE__
	sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
-	if (value ==131287967) return CPU_VORTEX;
+	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
#endif
	return CPU_ARMV8;
#endif

@@ -265,7 +267,7 @@ int n=0;

	printf("#define NUM_CORES %d\n",n);
#endif
-#ifdef DARWIN
+#ifdef __APPLE__
	sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
	printf("#define NUM_CORES %d\n",value);
#endif

@@ -420,17 +422,19 @@ void get_cpuconfig(void)
	printf("#define DTB_DEFAULT_ENTRIES 64 \n");
	printf("#define DTB_SIZE 4096 \n");
	break;
-#ifdef DARWIN
+#ifdef __APPLE__
	case CPU_VORTEX:
	printf("#define VORTEX \n");
-	sysctlbyname("hw.l1icachesize",&value,&length,NULL,0);
-	printf("#define L1_CODE_SIZE %d \n",value);
-	sysctlbyname("hw.cachelinesize",&value,&length,NULL,0);
-	printf("#define L1_CODE_LINESIZE %d \n",value);
-	sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
-	printf("#define L1_DATA_SIZE %d \n",value);
-	sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
-	printf("#define L2_SIZE %d \n",value);
+	sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
+	printf("#define L1_CODE_SIZE %lld \n",value64);
+	sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
+	printf("#define L1_CODE_LINESIZE %lld \n",value64);
+	sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
+	printf("#define L1_DATA_SIZE %lld \n",value64);
+	sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
+	printf("#define L2_SIZE %lld \n",value64);
	printf("#define DTB_DEFAULT_ENTRIES 64 \n");
	printf("#define DTB_SIZE 4096 \n");
	break;
#endif
}
@@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES})
    GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type})
  endif ()

+  # special defines for complex
  if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")

    foreach (u_source ${U_SOURCES})

@@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES})
  endif ()
endforeach ()

+if (BUILD_BFLOAT16)
+  if (USE_THREAD)
+    GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16")
+    GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16")
+  endif ()
+endif ()
+
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)
  if (USE_THREAD)
    GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE")
@@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
  if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
    GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0)
  endif ()
+  if (BUILD_BFLOAT16)
+    GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
+    if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
+      GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
+    endif ()
+  endif ()
endforeach ()

if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
@@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8;
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
extern gotoblas_t gotoblas_POWER9;
#endif
-//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
-//    || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
-//#define HAVE_P10_SUPPORT 1
-//#endif
#ifdef HAVE_P10_SUPPORT
extern gotoblas_t gotoblas_POWER10;
#endif
@@ -2695,7 +2695,7 @@ static volatile struct {

} memory[NUM_BUFFERS];

-static volatile struct newmemstruct
+struct newmemstruct
{
  BLASULONG lock;
  void *addr;
@@ -524,6 +524,9 @@ void blas_set_parameter(void){
  xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif

+#ifdef BUILD_BFLOAT16
+  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
+#endif
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;

@@ -629,7 +632,9 @@ void blas_set_parameter(void){
  xgemm_p = 16 * (size + 1);
#endif

+#ifdef BUILD_BFLOAT16
+  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
+#endif
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
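The sbgemm_r expression added in both hunks packs several steps into one line. Below is a standalone sketch of the same arithmetic with stand-in values for BUFFER_SIZE, SBGEMM_P/SBGEMM_Q, GEMM_OFFSET_A and GEMM_ALIGN — in OpenBLAS these come from the per-target parameter headers, so every number here is an assumption for illustration only.

```c
/* Standalone illustration of the sbgemm_r sizing arithmetic added above.
   All constants are made-up stand-ins for the per-target values. */
#include <stdio.h>

int main(void)
{
    long buffer_size = 32l << 20;  /* assumed 32 MB driver buffer            */
    long p = 832, q = 1026;        /* assumed SBGEMM_P / SBGEMM_Q            */
    long offset_a = 0;             /* assumed GEMM_OFFSET_A                  */
    long align = 0x03fffl;         /* assumed GEMM_ALIGN mask                */
    long esize = 4;                /* the same per-element factor the formula uses */

    /* bytes reserved for the aligned A panel ... */
    long a_panel = (p * q * esize + offset_a + align) & ~align;
    /* ... and what remains, divided into columns of q elements,
       rounded down to a multiple of 16 */
    long r = (((buffer_size - a_panel) / (q * esize)) - 15) & ~15;

    printf("sbgemm_r = %ld\n", r);
    return 0;
}
```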
getarch.c | 87
@@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
+#ifdef NO_AVX
+#define SUBARCHITECTURE "NEHALEM"
+#define ARCHCONFIG   "-DNEHALEM " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
+#define LIBNAME   "nehalem"
+#define CORENAME  "NEHALEM"
+#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG   "-DSANDYBRIDGE " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME   "sandybridge"
#define CORENAME  "SANDYBRIDGE"
+#endif
#endif

#ifdef FORCE_HASWELL
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
+#ifdef NO_AVX2
+#ifdef NO_AVX
+#define SUBARCHITECTURE "NEHALEM"
+#define ARCHCONFIG   "-DNEHALEM " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
+#define LIBNAME   "nehalem"
+#define CORENAME  "NEHALEM"
+#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG   "-DSANDYBRIDGE " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME   "sandybridge"
#define CORENAME  "SANDYBRIDGE"
+#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG   "-DHASWELL " \

@@ -350,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#ifdef FORCE_SKYLAKEX
-#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
+#ifdef NO_AVX512
+#ifdef NO_AVX2
+#ifdef NO_AVX
+#define SUBARCHITECTURE "NEHALEM"
+#define ARCHCONFIG   "-DNEHALEM " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
+#define LIBNAME   "nehalem"
+#define CORENAME  "NEHALEM"
+#else
+#define SUBARCHITECTURE "SANDYBRIDGE"
+#define ARCHCONFIG   "-DSANDYBRIDGE " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
+#define LIBNAME   "sandybridge"
+#define CORENAME  "SANDYBRIDGE"
+#endif
+#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG   "-DHASWELL " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
		     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME   "haswell"
#define CORENAME  "HASWELL"
+#endif
#else
-#define FORCE
-#define FORCE_INTEL
-#define ARCHITECTURE    "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG   "-DSKYLAKEX " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -380,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#ifdef FORCE_COOPERLAKE
-#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
+#ifdef NO_AVX512
+#ifdef NO_AVX2
+#ifdef NO_AVX
+#define SUBARCHITECTURE "NEHALEM"
+#define ARCHCONFIG   "-DNEHALEM " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
+#define LIBNAME   "nehalem"
+#define CORENAME  "NEHALEM"
+#else
+#define SUBARCHITECTURE "SANDYBRIDGE"
+#define ARCHCONFIG   "-DSANDYBRIDGE " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
+#define LIBNAME   "sandybridge"
+#define CORENAME  "SANDYBRIDGE"
+#endif
+#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG   "-DHASWELL " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
		     "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME   "haswell"
#define CORENAME  "HASWELL"
+#endif
#else
-#define FORCE
-#define FORCE_INTEL
-#define ARCHITECTURE    "X86"
#define SUBARCHITECTURE "COOPERLAKE"
#define ARCHCONFIG   "-DCOOPERLAKE " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
+#ifdef NO_AVX2
+#ifdef NO_AVX
+#define SUBARCHITECTURE "NEHALEM"
+#define ARCHCONFIG   "-DNEHALEM " \
+		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+		     "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
+#define LIBNAME   "nehalem"
+#define CORENAME  "NEHALEM"
+#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG   "-DSANDYBRIDGE " \
		     "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \

@@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME   "sandybridge"
#define CORENAME  "SANDYBRIDGE"
+#endif
#else
#define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG   "-DZEN " \
@@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS})
  GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX})
  GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})

+  GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true)
  #sdsdot, dsdot
  if (BUILD_SINGLE OR BUILD_DOUBLE)
    GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE")

@@ -104,6 +105,15 @@ endif ()
  GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG})
  GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG})

+  if (BUILD_BFLOAT16)
+    GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+    GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+    GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+    GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+    GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+    GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+    GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+  endif ()

  # complex-specific sources
  foreach (float_type ${FLOAT_TYPES})
@@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

  PRINT_DEBUG_CNAME;

-#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
+#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#ifdef DYNAMIC_ARCH
  if (support_avx512() )
#endif
@@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {

  FLOAT *buffer;
-  int trans, uplo;
+  int uplo;
  blasint info;
  FLOAT * ALPHA = &alpha;
  FLOAT alpha_r = ALPHA[0];

@@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO

  PRINT_DEBUG_CNAME;

-  trans = -1;
  uplo  = -1;
  info  =  0;
@@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
  GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE")
  GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE")

+  # sbdot
+  if (BUILD_BFLOAT16)
+    GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16")
+    GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16")
+    GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE")
+    GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16")
+    GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16")
+  endif()
+
  if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE)
    GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE")
    GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE")

@@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
  GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
  foreach (float_type ${FLOAT_TYPES})
    string(SUBSTRING ${float_type} 0 1 float_char)
-    if (${float_type} STREQUAL "BFLOAT16")
-      set (float_char "SB")
-    endif ()
    if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
      GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
      GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})

@@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
    GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
    GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
  endif ()
+  if (BUILD_BFLOAT16)
+    GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16")
+    GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16")
+  endif ()
  # Makefile.L3
  set(USE_TRMM false)
  string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)

@@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
    GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
  endif()

-  foreach (float_type SINGLE DOUBLE BFLOAT16)
+  foreach (float_type SINGLE DOUBLE)
    string(SUBSTRING ${float_type} 0 1 float_char)
-    if (${float_type} STREQUAL "BFLOAT16")
-      if (NOT ${BUILD_BFLOAT16})
-        continue ()
-      else ()
-        set (float_char "SB")
-      endif ()
-    endif ()
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
  endforeach()
  if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)

@@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
    GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE")
  endif ()

  if (BUILD_BFLOAT16)
    if (SBGEMMINCOPY)
      GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16")
    endif ()
    if (SBGEMMITCOPY)
      GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16")
    endif ()
    if (SBGEMMONCOPY)
      GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16")
    endif ()
    if (SBGEMMOTCOPY)
      GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16")
    endif ()
    GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
  endif ()
  foreach (float_type ${FLOAT_TYPES})
    string(SUBSTRING ${float_type} 0 1 float_char)
    if (${float_type} STREQUAL "BFLOAT16")
      set (float_char "SB")
    endif ()
    if (${float_char}GEMMINCOPY)
      GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
    endif ()

@@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
      GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
      GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
    endif ()
+    if (BUILD_BFLOAT16)
+      if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
+        set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_NN)
+        set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_NT)
+        set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_TN)
+        set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_TT)
+        set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_B0_NN)
+        set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_B0_NT)
+        set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_B0_TN)
+        set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
+      endif ()
+      if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
+        set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
+      endif ()
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
+      GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
+    endif ()
  endif ()

  if (NOT DEFINED ${float_char}OMATCOPY_CN)

@@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
    #geadd
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
  endforeach ()

  if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
    GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE")
    GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE")
@@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B03 x16
#define B04 x17

-#define I x18
-#define J x19
+#define I x19
+#define J x20

-#define TEMP1 x20
-#define TEMP2 x21
+#define TEMP1 x21

#define A_PREFETCH 2560
#define B_PREFETCH 256

@@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alpha x17
-#define temp x18
+//#define temp x18
#define tempOffset x19
#define tempK x20
+#define temp x21

#define alpha0 d10
#define alphaV0 v10.d[0]

@@ -30,7 +30,7 @@ All rights reserved.
#define B00 x22


-#define I x18
+#define I x21
#define J x19

#define TEMP1 x20

@@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alpha w17
-#define temp x18
+//#define temp x18
#define tempOffset x19
#define tempK x20
+#define temp x21

#define alpha0 s10
#define alphaV0 v10.s[0]

@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow2 x14
#define pCRow3 x15
#define pA x16
-#define alphaR x17
-#define alphaI x18
+#define alphaR x19
+#define alphaI x20

#define alpha0_R d10
#define alphaV0_R v10.d[0]

@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR x17
-#define alphaI x18
+#define alphaI x22
#define temp x19
#define tempOffset x20
#define tempK x21
@@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)

	if ( (inc_x == 1) && (inc_y == 1) )
	{
-		int n1 = n & -4;
#if V_SIMD && !defined(DSDOT)
		const int vstep = v_nlanes_f32;
		const int unrollx4 = n & (-vstep * 4);

@@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
		}
		dot = v_sum_f32(vsum0);
#elif defined(DSDOT)
+		int n1 = n & -4;
		for (; i < n1; i += 4)
		{
			dot += (double) y[i] * (double) x[i]

@@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
			     + (double) y[i+3] * (double) x[i+3] ;
		}
#else
+		int n1 = n & -4;
		for (; i < n1; i += 4)
		{
			dot += y[i] * x[i]
@@ -1,7 +1,6 @@
-ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
+ifeq ($(HAVE_GAS), 1)
include $(KERNELDIR)/KERNEL.POWER8
else

#SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c

@@ -44,6 +43,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = cgemm_kernel_power10.S
+#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c

@@ -218,5 +218,4 @@ QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

endif
@@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
#endif
	const float *mvecp = mvec;
-	/* We have to load reverse mask for big endian. */
-	/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
+#else
	__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif

	long ytmp;

	__asm__

@@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
	"xvmaddasp 38, 58, 33 \n\t"
	"xvmaddasp 39, 59, 33 \n\t"

+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	"stxv 48, 0(%4) \n\t"
+	"stxv 49, 16(%4) \n\t"
+	"stxv 50, 32(%4) \n\t"
+	"stxv 51, 48(%4) \n\t"
+	"stxv 34, 64(%4) \n\t"
+	"stxv 35, 80(%4) \n\t"
+	"stxv 38, 96(%4) \n\t"
+	"stxv 39, 112(%4) \n\t"
+#else
	"stxv 49, 0(%4) \n\t"
	"stxv 48, 16(%4) \n\t"
	"stxv 51, 32(%4) \n\t"

@@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
	"stxv 34, 80(%4) \n\t"
	"stxv 39, 96(%4) \n\t"
	"stxv 38, 112(%4) \n\t"
+#endif

	"addi %4, %4, 128 \n\t"
	"xxperm 52, 40, %x10 \n\t" // exchange real and imag part

@@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
	"xvmaddasp 38, 58, 33 \n\t"
	"xvmaddasp 39, 59, 33 \n\t"

+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	"stxv 48, 0(%4) \n\t"
+	"stxv 49, 16(%4) \n\t"
+	"stxv 50, 32(%4) \n\t"
+	"stxv 51, 48(%4) \n\t"
+	"stxv 34, 64(%4) \n\t"
+	"stxv 35, 80(%4) \n\t"
+	"stxv 38, 96(%4) \n\t"
+	"stxv 39, 112(%4) \n\t"
+#else
	"stxv 49, 0(%4) \n\t"
	"stxv 48, 16(%4) \n\t"
	"stxv 51, 32(%4) \n\t"

@@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
	"stxv 34, 80(%4) \n\t"
	"stxv 39, 96(%4) \n\t"
	"stxv 38, 112(%4) \n\t"
+#endif

	"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
	:
@@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)

	".align 5 \n"
	"one%=: \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	"stxv 32, 0(%3) \n\t"
+	"stxv 33, 16(%3) \n\t"
+	"stxv 34, 32(%3) \n\t"
+	"stxv 35, 48(%3) \n\t"
+	"stxv 36, 64(%3) \n\t"
+	"stxv 37, 80(%3) \n\t"
+	"stxv 38, 96(%3) \n\t"
+	"stxv 39, 112(%3) \n\t"
+#else
	"stxv 33, 0(%3) \n\t"
	"stxv 32, 16(%3) \n\t"
	"stxv 35, 32(%3) \n\t"

@@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
	"stxv 36, 80(%3) \n\t"
	"stxv 39, 96(%3) \n\t"
	"stxv 38, 112(%3) \n\t"
+#endif
	"lxvp 32, 0(%2) \n\t"
	"lxvp 34, 32(%2) \n\t"
	"lxvp 36, 64(%2) \n\t"
	"lxvp 38, 96(%2) \n\t"

+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	"stxv 40, 128(%3) \n\t"
+	"stxv 41, 144(%3) \n\t"
+	"stxv 42, 160(%3) \n\t"
+	"stxv 43, 176(%3) \n\t"
+	"stxv 44, 192(%3) \n\t"
+	"stxv 45, 208(%3) \n\t"
+	"stxv 46, 224(%3) \n\t"
+	"stxv 47, 240(%3) \n\t"
+#else
	"stxv 41, 128(%3) \n\t"
	"stxv 40, 144(%3) \n\t"
	"stxv 43, 160(%3) \n\t"

@@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
	"stxv 44, 208(%3) \n\t"
	"stxv 47, 224(%3) \n\t"
	"stxv 46, 240(%3) \n\t"
+#endif
	"lxvp 40, 128(%2) \n\t"
	"lxvp 42, 160(%2) \n\t"
	"lxvp 44, 192(%2) \n\t"

@@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
	"bgt one%= \n"

	"two%=: \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	"stxv 32, 0(%3) \n\t"
+	"stxv 33, 16(%3) \n\t"
+	"stxv 34, 32(%3) \n\t"
+	"stxv 35, 48(%3) \n\t"
+	"stxv 36, 64(%3) \n\t"
+	"stxv 37, 80(%3) \n\t"
+	"stxv 38, 96(%3) \n\t"
+	"stxv 39, 112(%3) \n\t"
+	"stxv 40, 128(%3) \n\t"
+	"stxv 41, 144(%3) \n\t"
+	"stxv 42, 160(%3) \n\t"
+	"stxv 43, 176(%3) \n\t"
+	"stxv 44, 192(%3) \n\t"
+	"stxv 45, 208(%3) \n\t"
+	"stxv 46, 224(%3) \n\t"
+	"stxv 47, 240(%3) \n\t"
+#else
	"stxv 33, 0(%3) \n\t"
	"stxv 32, 16(%3) \n\t"
	"stxv 35, 32(%3) \n\t"

@@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
	"stxv 44, 208(%3) \n\t"
	"stxv 47, 224(%3) \n\t"
	"stxv 46, 240(%3) \n\t"
+#endif
	"#n=%1 x=%4=%2 y=%0=%3"
	:
	"=m" (*y),
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else

#include "common.h"
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8

@@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA

	if ((inc_x == 1) && (inc_y == 1)) {

-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
		BLASLONG n1 = n & -16;
#else
		BLASLONG n1 = n & -8;
@@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+#else
	__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
	__asm__
	(
	"dcbt 0, %2 \n\t"

@@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
	"xxswapd 33, 34 \n\t"
	"xvaddsp 35, 35, 32 \n\t"
	"xvaddsp 34, 34, 33 \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	"xxpermdi 34, 35, 34, 0 \n\t"
+#else
	"xxpermdi 34, 34, 35, 2 \n\t"
+#endif
	"stxv 34, 0(%6) \n\t"

	"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
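The same compile-time dispatch on `__BYTE_ORDER__` recurs throughout these POWER kernels. A minimal sketch of the mechanism in isolation — `__BYTE_ORDER__` and `__ORDER_BIG_ENDIAN__` are predefined by GCC and Clang, the mask values are copied from the kernel above, and the printed labels are illustrative:

```c
/* Minimal illustration of the compile-time endianness dispatch used by
   the kernels above. No headers are required for the #if itself, since
   the macros are compiler-predefined. */
#include <stdio.h>

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const unsigned char mask[16] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
static const char *order = "big endian";
#else
static const unsigned char mask[16] = {11,10,9,8, 15,14,13,12, 3,2,1,0, 7,6,5,4};
static const char *order = "little endian";
#endif

int main(void)
{
    printf("compiled for %s, first mask byte %u\n", order, mask[0]);
    return 0;
}
```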
|
@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "cgemm_macros_power10.S"
|
||||
|
||||
#if (_AIX)
|
||||
.set perm_const1, 0x0405060700010203
|
||||
.set perm_const2, 0x0c0d0e0f08090a0b
|
||||
.set save_permute_12, 0x1011121300010203
|
||||
.set save_permute_11, 0x18191a1b08090a0b
|
||||
#else
|
||||
.equ perm_const1, 0x0405060700010203
|
||||
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||
.equ save_permute_11, 0x0405060714151617
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/*load reverse permute mask for big endian
|
||||
uint128 = 0xc0d0e0f08090a0b0405060700010203
|
||||
*/
|
||||
#if (_AIX)
|
||||
lis T2, (perm_const2>>48 & 0xFFFF)
|
||||
lis T1, (perm_const1>>48 & 0xFFFF)
|
||||
lis T3, (save_permute_12>>48 & 0xFFFF)
|
||||
lis T4, (save_permute_11>>48 & 0xFFFF)
|
||||
|
||||
ori T2, T2, (perm_const2>>32 & 0xFFFF)
|
||||
ori T1, T1, (perm_const1>>32 & 0xFFFF)
|
||||
ori T3, T3, (save_permute_12>>32 & 0xFFFF)
|
||||
ori T4, T4, (save_permute_11>>32 & 0xFFFF)
|
||||
#else
|
||||
lis T2, perm_const2@highest
|
||||
lis T1, perm_const1@highest
|
||||
lis T3, save_permute_12@highest
|
||||
lis T4, save_permute_11@highest
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@higher
|
||||
ori T1, T1, perm_const1@higher
|
||||
ori T3, T3, save_permute_12@higher
|
||||
ori T4, T4, save_permute_11@higher
|
||||
|
||||
#endif
|
||||
|
||||
rldicr T2, T2, 32, 31
|
||||
rldicr T1, T1, 32, 31
|
||||
rldicr T3, T3, 32, 31
|
||||
rldicr T4, T4, 32, 31
|
||||
|
||||
#if (_AIX)
|
||||
oris T2, T2, (perm_const2>>16 & 0xFFFF)
|
||||
oris T1, T1, (perm_const1>>16 & 0xFFFF)
|
||||
oris T3, T3, (save_permute_12>>16 & 0xFFFF)
|
||||
oris T4, T4, (save_permute_11>>16 & 0xFFFF)
|
||||
|
||||
ori T2, T2, (perm_const2 & 0xFFFF)
|
||||
ori T1, T1, (perm_const1 & 0xFFFF)
|
||||
ori T3, T3, (save_permute_12 & 0xFFFF)
|
||||
ori T4, T4, (save_permute_11 & 0xFFFF)
|
||||
#else
|
||||
oris T2, T2, perm_const2@h
|
||||
oris T1, T1, perm_const1@h
|
||||
oris T3, T3, save_permute_12@h
|
||||
|
@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ori T1, T1, perm_const1@l
|
||||
ori T3, T3, save_permute_12@l
|
||||
ori T4, T4, save_permute_11@l
|
||||
|
||||
#endif
|
||||
|
||||
li r0,0
|
||||
li PRE,512
|
||||
|
|
|
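For reference, the `@highest`/`@higher`/`@h`/`@l` relocations in the non-AIX branch and the explicit shift-and-mask expressions in the AIX branch both decompose one 64-bit constant into four 16-bit immediates. A host-side sketch of that arithmetic in plain C (the constant is `perm_const1` from the kernel; no POWER assembly is involved):

```c
/* How the AIX branch above splits a 64-bit constant into the four
   16-bit immediates consumed by lis / ori / rldicr / oris / ori. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t perm_const1 = 0x0405060700010203ULL;

    uint16_t highest = (perm_const1 >> 48) & 0xFFFF;  /* lis  */
    uint16_t higher  = (perm_const1 >> 32) & 0xFFFF;  /* ori  */
    uint16_t high    = (perm_const1 >> 16) & 0xFFFF;  /* oris */
    uint16_t low     =  perm_const1        & 0xFFFF;  /* ori  */

    /* reassemble exactly as the instruction sequence does */
    uint64_t t = ((uint64_t)highest << 16) | higher;  /* lis + ori          */
    t <<= 32;                                         /* rldicr T, T, 32,31 */
    t |= ((uint64_t)high << 16) | low;                /* oris + ori         */

    printf("rebuilt: 0x%016llx (match: %s)\n",
           (unsigned long long)t, t == perm_const1 ? "yes" : "no");
    return 0;
}
```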
@@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
	addi \AREG, \AREG, \OffsetA
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp 3, 36, 34
+	xvf32gerpp 2, 37, 34
+	xvf32gerpp 1, 32, 34
+	xvf32gerpp 0, 33, 34
+	xvf32gerpp 7, 36, 35
+	xvf32gerpp 6, 37, 35
+	xvf32gerpp 5, 32, 35
+	xvf32gerpp 4, 33, 35
+#else
	xvf32gerpp 3, 36, 35
	xvf32gerpp 2, 37, 35
	xvf32gerpp 1, 32, 35

@@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xvf32gerpp 6, 37, 34
	xvf32gerpp 5, 32, 34
	xvf32gerpp 4, 33, 34
+#endif
.endm

.macro LOAD4x8_2

@@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	xvf32gerpp 3, 36, 34
+	xvf32gerpp 2, 37, 34
+	xvf32gerpp 1, 32, 34
+	xvf32gerpp 0, 33, 34
+	xvf32gerpp 7, 36, 35
+	xvf32gerpp 6, 37, 35
+	xvf32gerpp 5, 32, 35
+	xvf32gerpp 4, 33, 35
+#else
	xvf32gerpp 3, 36, 35
	xvf32gerpp 2, 37, 35
	xvf32gerpp 1, 32, 35

@@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xvf32gerpp 6, 37, 34
	xvf32gerpp 5, 32, 34
	xvf32gerpp 4, 33, 34
+#endif
.if \Complete==0
	lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
	lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
	lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	xvf32gerpp 3, 42, 38
+	xvf32gerpp 2, 43, 38
+	xvf32gerpp 1, 40, 38
+	xvf32gerpp 0, 41, 38
+	xvf32gerpp 7, 42, 39
+	xvf32gerpp 6, 43, 39
+	xvf32gerpp 5, 40, 39
+	xvf32gerpp 4, 41, 39
+#else
	xvf32gerpp 3, 42, 39
	xvf32gerpp 2, 43, 39
	xvf32gerpp 1, 40, 39

@@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xvf32gerpp 6, 43, 38
	xvf32gerpp 5, 40, 38
	xvf32gerpp 4, 41, 38
+#endif
.if \Complete==0
	lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
	lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
@@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
	/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi vs1, vs0, vs8, 1
+	xxpermdi vs3, vs2, vs10, 1
+	xxpermdi vs5, vs4, vs12, 1
+	xxpermdi vs7, vs6, vs14, 1
+	xxpermdi vs9, vs8, vs0, 1
+	xxpermdi vs11, vs10, vs2, 1
+#else
	xxpermdi vs1, vs8, vs0, 2
	xxpermdi vs3, vs10, vs2, 2
	xxpermdi vs5, vs12, vs4, 2
	xxpermdi vs7, vs14, vs6, 2
	xxpermdi vs9, vs0, vs8, 2
	xxpermdi vs11, vs2, vs10, 2
+#endif
	xvaddsp vs24, vs24, vs3
	xvaddsp vs25, vs25, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi vs13, vs12, vs4, 1
+	xxpermdi vs15, vs14, vs6, 1
+#else
	xxpermdi vs13, vs4, vs12, 2
	xxpermdi vs15, vs6, vs14, 2
+#endif
	xvaddsp vs26, vs26, vs7
	xvaddsp vs27, vs27, vs5
	xvaddsp vs28, vs28, vs11
	xvaddsp vs29, vs29, vs9
	xvaddsp vs30, vs30, vs15
	xvaddsp vs31, vs31, vs13
#else
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	xxpermdi vs25, vs0, vs8, 1
+	xxpermdi vs24, vs2, vs10, 1
+	xxpermdi vs27, vs4, vs12, 1
+	xxpermdi vs26, vs6, vs14, 1
+	xxpermdi vs29, vs8, vs0, 1
+	xxpermdi vs28, vs10, vs2, 1
+	xxpermdi vs31, vs12, vs4, 1
+	xxpermdi vs30, vs14, vs6, 1
+#else
	xxpermdi vs25, vs8, vs0, 2
	xxpermdi vs24, vs10, vs2, 2

@@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xxpermdi vs28, vs2, vs10, 2
	xxpermdi vs31, vs4, vs12, 2
	xxpermdi vs30, vs6, vs14, 2
+#endif
#endif
	stxvp vs24, 0(CO)
	MULT_APLHA_PART1 vs48, vs56, vs0, vs1

@@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
	/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi vs1, vs0, vs8, 1
+	xxpermdi vs3, vs2, vs10, 1
+	xxpermdi vs5, vs4, vs12, 1
+	xxpermdi vs7, vs6, vs14, 1
+	xxpermdi vs9, vs8, vs0, 1
+	xxpermdi vs11, vs10, vs2, 1
+#else
	xxpermdi vs1, vs8, vs0, 2
	xxpermdi vs3, vs10, vs2, 2
	xxpermdi vs5, vs12, vs4, 2
	xxpermdi vs7, vs14, vs6, 2
	xxpermdi vs9, vs0, vs8, 2
	xxpermdi vs11, vs2, vs10, 2
+#endif
	xvaddsp vs32, vs32, vs3
	xvaddsp vs33, vs33, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi vs13, vs12, vs4, 1
+	xxpermdi vs15, vs14, vs6, 1
+#else
	xxpermdi vs13, vs4, vs12, 2
	xxpermdi vs15, vs6, vs14, 2
+#endif
	xvaddsp vs40, vs40, vs7
	xvaddsp vs41, vs41, vs5
	xvaddsp vs34, vs34, vs11
	xvaddsp vs35, vs35, vs9
	xvaddsp vs42, vs42, vs15
	xvaddsp vs43, vs43, vs13
#else
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	xxpermdi vs33, vs0, vs8, 1
+	xxpermdi vs32, vs2, vs10, 1
+	xxpermdi vs41, vs4, vs12, 1
+	xxpermdi vs40, vs6, vs14, 1
+	xxpermdi vs35, vs8, vs0, 1
+	xxpermdi vs34, vs10, vs2, 1
+	xxpermdi vs43, vs12, vs4, 1
+	xxpermdi vs42, vs14, vs6, 1
+#else
	xxpermdi vs33, vs8, vs0, 2
	xxpermdi vs32, vs10, vs2, 2

@@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xxpermdi vs34, vs2, vs10, 2
	xxpermdi vs43, vs4, vs12, 2
	xxpermdi vs42, vs6, vs14, 2
+#endif
#endif
	stxvp vs32, 0(T2)
	stxvp vs40, 32(T2)
@@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
	addi \AREG, \AREG, \OffsetA
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp 3, 32, 35
+	xvf32gerpp 2, 33, 35
+	xvf32gerpp 1, 32, 34
+	xvf32gerpp 0, 33, 34
+#else
	xvf32gerpp 3, 32, 34
	xvf32gerpp 2, 33, 34
	xvf32gerpp 1, 32, 35
	xvf32gerpp 0, 33, 35
+#endif
.endm

.macro LOAD4x4_2

@@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp 3, 32, 35
+	xvf32gerpp 2, 33, 35
+	xvf32gerpp 1, 32, 34
+	xvf32gerpp 0, 33, 34
+#else
	xvf32gerpp 3, 32, 34
	xvf32gerpp 2, 33, 34
	xvf32gerpp 1, 32, 35
	xvf32gerpp 0, 33, 35
+#endif
.if \Complete==0
	lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
	lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp 3, 36, 39
+	xvf32gerpp 2, 37, 39
+	xvf32gerpp 1, 36, 38
+	xvf32gerpp 0, 37, 38
+#else
	xvf32gerpp 3, 36, 38
	xvf32gerpp 2, 37, 38
	xvf32gerpp 1, 36, 39
	xvf32gerpp 0, 37, 39
+#endif
.if \Complete==0
	lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
	lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)

@@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
	/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi vs1, vs0, vs8, 1
+	xxpermdi vs3, vs2, vs10, 1
+	xxpermdi vs9, vs8, vs0, 1
+	xxpermdi vs11, vs10, vs2, 1
+	xxpermdi vs5, vs4, vs12, 1
+	xxpermdi vs7, vs6, vs14, 1
+	xxpermdi vs13, vs12, vs4, 1
+	xxpermdi vs15, vs14, vs6, 1
+#else
	xxpermdi vs1, vs8, vs0, 2
	xxpermdi vs3, vs10, vs2, 2
	xxpermdi vs9, vs0, vs8, 2

@@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xxpermdi vs7, vs14, vs6, 2
	xxpermdi vs13, vs4, vs12, 2
	xxpermdi vs15, vs6, vs14, 2
+#endif
	xvaddsp vs24, vs24, vs3
	xvaddsp vs25, vs25, vs1
	xvaddsp vs26, vs26, vs11

@@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xvaddsp vs29, vs29, vs5
	xvaddsp vs30, vs30, vs15
	xvaddsp vs31, vs31, vs13
#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi vs25, vs0, vs8, 1
+	xxpermdi vs24, vs2, vs10, 1
+	xxpermdi vs27, vs8, vs0, 1
+	xxpermdi vs26, vs10, vs2, 1
+	xxpermdi vs29, vs4, vs12, 1
+	xxpermdi vs28, vs6, vs14, 1
+	xxpermdi vs31, vs12, vs4, 1
+	xxpermdi vs30, vs14, vs6, 1
+#else
	xxpermdi vs25, vs8, vs0, 2
	xxpermdi vs24, vs10, vs2, 2

@@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	xxpermdi vs28, vs14, vs6, 2
	xxpermdi vs31, vs4, vs12, 2
	xxpermdi vs30, vs6, vs14, 2
+#endif
#endif
	stxvp vs24, 0(CO)
	stxvp vs26, 0(T1)
@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.if \OffsetA != 0
|
||||
addi \AREG, \AREG, \OffsetA
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 1, 35, 32
|
||||
xvf32gerpp 0, 34, 32
|
||||
#else
|
||||
xvf32gerpp 1, 34, 32
|
||||
xvf32gerpp 0, 35, 32
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro LOAD4x2_2
|
||||
|
@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 1, 35, 32
|
||||
xvf32gerpp 0, 34, 32
|
||||
#else
|
||||
xvf32gerpp 1, 34, 33
|
||||
xvf32gerpp 0, 35, 33
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 1, 37, 33
|
||||
xvf32gerpp 0, 36, 33
|
||||
#else
|
||||
xvf32gerpp 1, 36, 32
|
||||
xvf32gerpp 0, 37, 32
|
||||
#endif
|
||||
.if \Complete==0
|
||||
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
|
||||
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
|
||||
|
@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
RECONSTRUCT_PAIR1
|
||||
#ifndef TRMMKERNEL
|
||||
/* add */
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs1, vs0, vs8, 0
|
||||
xxpermdi vs9, vs2, vs10, 0
|
||||
xxpermdi vs3, vs8, vs0, 3
|
||||
xxpermdi vs11, vs10, vs2, 3
|
||||
#else
|
||||
xxpermdi vs1, vs8, vs0, 0
|
||||
xxpermdi vs9, vs10, vs2, 0
|
||||
xxpermdi vs3, vs0, vs8, 3
|
||||
xxpermdi vs11, vs2, vs10, 3
|
||||
#endif
|
||||
xvaddsp vs24, vs24, vs1
|
||||
xvaddsp vs26, vs26, vs9
|
||||
xvaddsp vs25, vs25, vs3
|
||||
xvaddsp vs27, vs27, vs11
|
||||
#else
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs24, vs0, vs8, 0
|
||||
xxpermdi vs26, vs2, vs10, 0
|
||||
xxpermdi vs25, vs8, vs0, 3
|
||||
xxpermdi vs27, vs10, vs2, 3
|
||||
#else
|
||||
xxpermdi vs24, vs8, vs0, 0
|
||||
xxpermdi vs26, vs10, vs2, 0
|
||||
xxpermdi vs25, vs0, vs8, 3
|
||||
xxpermdi vs27, vs2, vs10, 3
|
||||
#endif
|
||||
#endif
|
||||
stxv vs24, 0(CO)
|
||||
stxv vs25, 0(T1)
|
||||
|
@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.if \OffsetA != 0
|
||||
addi \AREG, \AREG, \OffsetA
|
||||
.endif
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf32gerpp 0, 34, 32
|
||||
xvf32gerpp 1, 35, 32
|
||||
#else
|
||||
xvf32gerpp 0, 35, 32
|
||||
xvf32gerpp 1, 34, 32
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro LOAD4x1_2
|
||||
|
@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro LOAD4x1_2O OffsetA, OffsetB
|
||||
lxv vs32, (\OffsetA)(AO)
|
||||
vspltisb v6, 0
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxpermdi vs33, vs32, vs38, 2
|
||||
xxpermdi vs32, vs32, vs38, 0
|
||||
#else
|
||||
xxpermdi vs33, vs32, vs38, 0
|
||||
xxpermdi vs32, vs32, vs38, 2
|
||||
#endif
|
||||
lxvp vs34, (0+\OffsetB)(BO)
|
||||
lxvp vs36, (32+\OffsetB)(BO)
|
||||
.endm
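LOAD4x1_2O zeroes v6 (vs38) with vspltisb and then uses xxpermdi to pair the single loaded A column with zeros in both orders. A hedged model of the xxpermdi selection, with invented names (dw[0] is the high doubleword of the 128-bit VSR, independent of memory endianness, which is why the immediates flip between the BE and LE branches):

    #include <stdint.h>

    typedef struct { uint64_t dw[2]; } vsr128;

    /* xxpermdi XT,XA,XB,DM: DM bit 1 picks the doubleword taken from XA,
     * DM bit 0 the one taken from XB; 0 merges the high halves, 3 the low
     * halves, and 1/2 cross them. */
    static vsr128 xxpermdi_model(vsr128 a, vsr128 b, unsigned dm)
    {
        vsr128 r;
        r.dw[0] = a.dw[(dm >> 1) & 1];
        r.dw[1] = b.dw[dm & 1];
        return r;
    }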
@@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 36, 33
xvf32gerpp 1, 37, 33
#else
xvf32gerpp 0, 37, 33
xvf32gerpp 1, 36, 33
#endif
.if \Complete==0
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
.endif
.if \IsLast==1
.if \Complete==1

@@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 37, 34
xvf32gerpp 3, 36, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif

.if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 41, 35
xvf32gerpp 3, 40, 35
xvf32gerpp 0, 39, 35
xvf32gerpp 1, 38, 35
#else
xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34
#endif

.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)

@@ -1068,22 +1262,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2

@@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 32(CO)

@@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 37, 35
xvf32gerpp 1, 36, 35
#else
xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34
#endif

.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)

@@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)

@@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs8, vs0, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs8, vs0, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs26, 0(T1)

@@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
.endm

@@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 3, 35, 40
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif
.if \IsLast==1

@@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2

@@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs26, 32(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
stxv vs6, 32(CO)
stxv vs4, 48(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
stxv vs4, 32(CO)
stxv vs6, 48(CO)
#endif
#endif
addi CO, CO, 64
.endm

@@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, (32+\OffsetA)(AO)
.endm

@@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 1, 35, 36
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif
.if \IsLast==1

@@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2

@@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
#endif
#endif
addi CO, CO, 32
.endm

@@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
#else
xxperm vs0, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs0

@@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37

@@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "cswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif
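This dispatch change repeats across the level-1 files below: the flattened diff shows the old chain falling back to the POWER8 microkernel when a POWER10 target was built big-endian, while the new chain includes the POWER10 microkernel unconditionally (the endian handling now lives inside the microkernels themselves). A hedged sketch of the resulting idiom, with invented file names:

    #if defined(__VEC__) || defined(__ALTIVEC__)
    #if defined(POWER8) || defined(POWER9)
    #include "foo_microk_power8.c"   /* hypothetical file name */
    #elif defined(POWER10)
    #include "foo_microk_power10.c"  /* hypothetical file name */
    #endif
    #endif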
@@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dasum_microk_power10.c"
#elif defined(POWER10)
#include "dasum_microk_power8.c"
#include "dasum_microk_power10.c"
#endif
#endif

#ifndef HAVE_KERNEL_16

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)

@@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32)
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
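The peeling count above is worth a worked example. For 8-byte doubles and a 32-byte alignment target, (32 - (addr & 31)) >> 3 is the number of elements left before the next 32-byte boundary, and the final & 0x3 maps the already-aligned case (four) back to zero; an address with addr % 32 == 8, for instance, yields three scalar lead-in elements. A hedged restatement in isolation:

    #include <stdint.h>

    static inline long peel_to_32byte_doubles(const double *x)
    {
        return ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
    }

The single-precision variants later in this diff use >> 2 and & 0x7 for the same computation over 4-byte elements.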
@@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(32,%x9,0) // alpha, alpha

"sldi %6, %13, 3 \n\t" // lda * sizeof (double)

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
#else
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
#endif

"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
#endif
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y

"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha

@@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
#endif

"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda

@@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"one%=: \n\t"

"lxvp 36, 0( %2) \n\t" // y0, y1

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
#endif
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
#else
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
#endif
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
#else
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
#endif
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
#else
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
#endif
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
#else
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
#endif
"lxvpx 50, %7, %11 \n\t" // a4[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
#else
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
#endif
"lxvpx 52, %8, %11 \n\t" // a5[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
#else
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
#endif
"lxvpx 54, %9, %11 \n\t" // a6[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"

@@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"two%=: \n\t"

"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"

@@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"stxvp 36, 0( %2) \n\t" // y0, y1

:

@@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvp 40, 32(%[y]) \n\t"

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(42,34,35)
XXMRGLD_S(43,34,35)

XXMRGHD_S(44,4,5)
XXMRGLD_S(45,4,5)
#else
XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34)

XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4)
#endif

"xvadddp 42,42,43 \n\t"

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(46,6,7)
XXMRGLD_S(47,6,7)
#else
XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6)
#endif
"xvadddp 44,44,45 \n\t"

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(48,8,9)
XXMRGLD_S(49,8,9)
#else
XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8)
#endif
"xvadddp 46,46,47 \n\t"

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 38,42,36 \n\t"
"xvmaddadp 39,44,36 \n\t"
#else
"xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
#endif
"xvadddp 48,48,49 \n\t"

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 41,48,36 \n\t"
#else
"xvmaddadp 41,46,36 \n\t"
#endif
"stxvp 38, 0(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 40,46,36 \n\t"
#else
"xvmaddadp 40,48,36 \n\t"
#endif
"stxvp 40, 32(%[y]) \n\t"

: [memy] "+m" (*(double (*)[8])y),
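This reduction epilogue merges the high doublewords of two accumulators, merges the low doublewords, and adds the two merges, leaving one finished dot product in each half of the result. On the other byte order the halves arrive swapped, so XXMRGHD_S and XXMRGLD_S trade places while the arithmetic stays identical. A hedged scalar model (illustration only):

    typedef struct { double dw[2]; } vsr2d;

    static vsr2d reduce_two_accumulators(vsr2d acc_a, vsr2d acc_b)
    {
        vsr2d hi = { { acc_a.dw[0], acc_b.dw[0] } };                 /* xxmrghd */
        vsr2d lo = { { acc_a.dw[1], acc_b.dw[1] } };                 /* xxmrgld */
        vsr2d r  = { { hi.dw[0] + lo.dw[0], hi.dw[1] + lo.dw[1] } }; /* xvadddp */
        return r;
    }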
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "drot_microk_power10.c"
#elif defined(POWER10)
#include "drot_microk_power8.c"
#include "drot_microk_power10.c"
#endif
#endif

@@ -110,8 +108,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT temp;

if ( n <= 0 ) return(0);

@@ -119,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -139,7 +135,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
drot_kernel_16(n1, x1, y1, c, s);
drot_kernel_16(n1, x, y, c, s);
i=n1;
}
#endif

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dscal_microk_power10.c"
#elif defined(POWER10)
#include "dscal_microk_power8.c"
#include "dscal_microk_power10.c"
#endif
#endif

@@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "dswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif

@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

@@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {

if (inc_x == 1) {

BLASLONG n1 = n & -32;
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)

BLASLONG n1 = n & -32;
if (n1 > 0) {

max = diamax_kernel_32(n1, x, &maxf);

@@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sasum_microk_power10.c"
#elif defined(POWER10)
#include "sasum_microk_power8.c"
#include "sasum_microk_power10.c"
#endif
#endif

@@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "srot_microk_power10.c"
#elif defined(POWER10)
#include "srot_microk_power8.c"
#include "srot_microk_power10.c"
#endif
#endif

@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sscal_microk_power10.c"
#elif defined(POWER10)
#include "sscal_microk_power8.c"
#include "sscal_microk_power10.c"
#endif
#endif

@@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

@@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "sswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif

@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{

#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;

@@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b,
vector FLOAT *Vc6 = (vector FLOAT *) c6;
vector FLOAT *Vc7 = (vector FLOAT *) c7;
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
int j;

b[120] = (c0[15] *= a[255]);
b[121] = (c1[15] *= a[255]);

@@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b,
vector FLOAT *Vc6 = (vector FLOAT *) c6;
vector FLOAT *Vc7 = (vector FLOAT *) c7;
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
int j;

b[0] = (c0[0] *= a[0]);
b[1] = (c1[0] *= a[0]);

@@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
double alpha_r, double alpha_i)
{
#if !defined(CONJ)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { -1.0, 1.0 };
#else
static const double mvec[2] = { 1.0, -1.0 };
#endif
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { 1.0, -1.0 };
#else
static const double mvec[2] = { -1.0, 1.0 };
#endif
#endif
const double *mvecp = mvec;
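The four mvec variants encode one fact: complex axpy needs a -1 on exactly one of the two alpha_i partial products, and both CONJ and the lane order of the target flip which vector slot carries it. A hedged plain-C rendering of the non-conjugated case for a single element:

    /* y += alpha * x for one complex double; the -1 kept in mvec is the
     * minus sign on the alpha_i * x[1] term below. */
    static void zaxpy_one(const double x[2], double y[2],
                          double alpha_r, double alpha_i)
    {
        y[0] += alpha_r * x[0] - alpha_i * x[1];
        y[1] += alpha_r * x[1] + alpha_i * x[0];
    }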
@@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r0, FLINK_SAVE(SP)

#if defined(linux) || defined(__FreeBSD__)
#if defined(linux) || defined(__FreeBSD__) || defined(_AIX)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif

#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif

@@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm
/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/

.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#endif
.endm
/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/

.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#endif
.endm
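These macros feed the usual four-partial-product complex multiply: with real*real, imag*imag, real*imag and imag*real packed two results per register, the finished values are (real*real - imag*imag) and (real*imag + imag*real), which is then scaled by alpha the same way. A hedged one-element restatement:

    /* (cr + i*ci) = (ar + i*ai) * (br + i*bi) */
    static void cmul_one(double ar, double ai, double br, double bi,
                         double *cr, double *ci)
    {
        *cr = ar * br - ai * bi;   /* real*real - imag*imag */
        *ci = ar * bi + ai * br;   /* real*imag + imag*real */
    }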
/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/

@@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1,\VSIN1,\VSIN2
xxmrgld \VSOUT2,\VSIN1,\VSIN2
#else
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
#endif
.endm

@@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd vs46,vs50,vs50
xxmrgld vs47,vs50,vs50
#else
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
xxmrghd vs39,vs47,vs46
#endif
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm

@@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49

@@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs44, vs50
xvf64gerpp 3, vs46, vs50
xvf64gerpp 4, vs40, vs51
xvf64gerpp 5, vs42, vs51
xvf64gerpp 6, vs44, vs51
xvf64gerpp 7, vs46, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51

@@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)

@@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x8 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49

@@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
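The 2xN kernels above use the double-precision form of the same MMA instruction: xvf64gerpp takes an even/odd VSR pair (four doubles) against one vector of two doubles, and again only the choice between the vs48/vs49 halves of the B pair changes with byte order. A hedged builtin-level sketch (gcc >= 10, -mcpu=power10; names invented):

    #include <altivec.h>

    typedef vector unsigned char vec_t;

    /* acc += a (4 doubles, as a register pair) outer-product b (2 doubles) */
    void f64ger_acc_example(__vector_pair a, vec_t b, double out[4][2])
    {
        __vector_quad acc;
        __builtin_mma_xxsetaccz(&acc);
        __builtin_mma_xvf64gerpp(&acc, a, b);
        __builtin_mma_disassemble_acc(out, &acc);
    }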
@@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34

@@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47

#endif
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01

@@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs16, vs32, vs32
xxlor vs17, vs33, vs33
xxlor vs18, vs34, vs34
xxlor vs19, vs35, vs35
xxlor vs20, vs36, vs36
xxlor vs21, vs37, vs37
xxlor vs22, vs38, vs38
xxlor vs23, vs39, vs39
xxlor vs24, vs40, vs40
xxlor vs25, vs41, vs41
xxlor vs26, vs42, vs42
xxlor vs27, vs43, vs43
xxlor vs28, vs44, vs44
xxlor vs29, vs45, vs45
xxlor vs30, vs46, vs46
xxlor vs31, vs47, vs47
#else
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34

@@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47

#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
||||
|
@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
|
||||
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
|
||||
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 1, vs34, vs48
|
||||
xvf64gerpp 2, vs32, vs49
|
||||
xvf64gerpp 3, vs34, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 1, vs34, vs49
|
||||
xvf64gerpp 2, vs32, vs48
|
||||
xvf64gerpp 3, vs34, vs48
|
||||
#endif
|
||||
lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
|
||||
lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
|
||||
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs40, vs50
|
||||
xvf64gerpp 1, vs42, vs50
|
||||
xvf64gerpp 2, vs40, vs51
|
||||
xvf64gerpp 3, vs42, vs51
|
||||
#else
|
||||
xvf64gerpp 0, vs40, vs51
|
||||
xvf64gerpp 1, vs42, vs51
|
||||
xvf64gerpp 2, vs40, vs50
|
||||
xvf64gerpp 3, vs42, vs50
|
||||
#endif
|
||||
.if \IsLast==1
|
||||
addi AO, AO, DISP8(\Index,128)
|
||||
addi BO, BO, DISP4(\Index,64)
|
||||
|
@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.macro LOAD_END_2x4 OffsetA, OffsetB
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 1, vs34, vs48
|
||||
xvf64gerpp 2, vs32, vs49
|
||||
xvf64gerpp 3, vs34, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 1, vs34, vs49
|
||||
xvf64gerpp 2, vs32, vs48
|
||||
xvf64gerpp 3, vs34, vs48
|
||||
#endif
|
||||
addi BO, BO, \OffsetB
|
||||
addi AO, AO, \OffsetA
|
||||
.endm
|
||||
|
@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs45, vs12, vs13, 0b10
|
||||
xxpermdi vs46, vs14, vs15, 0b01
|
||||
xxpermdi vs47, vs14, vs15, 0b10
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxlor vs0, vs32, vs32
|
||||
xxlor vs1, vs33, vs33
|
||||
xxlor vs2, vs34, vs34
|
||||
xxlor vs3, vs35, vs35
|
||||
xxlor vs4, vs36, vs36
|
||||
xxlor vs5, vs37, vs37
|
||||
xxlor vs6, vs38, vs38
|
||||
xxlor vs7, vs39, vs39
|
||||
xxlor vs8, vs40, vs40
|
||||
xxlor vs9, vs41, vs41
|
||||
xxlor vs10, vs42, vs42
|
||||
xxlor vs11, vs43, vs43
|
||||
xxlor vs12, vs44, vs44
|
||||
xxlor vs13, vs45, vs45
|
||||
xxlor vs14, vs46, vs46
|
||||
xxlor vs15, vs47, vs47
|
||||
#else
|
||||
xxlor vs2, vs32, vs32
|
||||
xxlor vs3, vs33, vs33
|
||||
xxlor vs0, vs34, vs34
|
||||
|
@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxlor vs15, vs45, vs45
|
||||
xxlor vs12, vs46, vs46
|
||||
xxlor vs13, vs47, vs47
|
||||
|
||||
#endif
|
||||
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
|
||||
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
|
||||
addi CO, CO, 64
|
||||
|
@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL2x2_2 Index, IsLast
|
||||
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
|
||||
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 1, vs32, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 1, vs32, vs48
|
||||
#endif
|
||||
lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
|
||||
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs40, vs50
|
||||
xvf64gerpp 1, vs40, vs51
|
||||
#else
|
||||
xvf64gerpp 0, vs40, vs51
|
||||
xvf64gerpp 1, vs40, vs50
|
||||
#endif
|
||||
.if \IsLast==1
|
||||
addi AO, AO, DISP4(\Index,64)
|
||||
addi BO, BO, DISP4(\Index,64)
|
||||
|
@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.macro LOAD_END_2x2 OffsetA,OffsetB
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 1, vs32, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 1, vs32, vs48
|
||||
#endif
|
||||
addi BO, BO, \OffsetB
|
||||
addi AO, AO, \OffsetA
|
||||
.endm
|
||||
|
@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs37, vs4, vs5, 0b10
|
||||
xxpermdi vs38, vs6, vs7, 0b01
|
||||
xxpermdi vs39, vs6, vs7, 0b10
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxlor vs0, vs32, vs32
|
||||
xxlor vs1, vs33, vs33
|
||||
xxlor vs2, vs34, vs34
|
||||
xxlor vs3, vs35, vs35
|
||||
xxlor vs4, vs36, vs36
|
||||
xxlor vs5, vs37, vs37
|
||||
xxlor vs6, vs38, vs38
|
||||
xxlor vs7, vs39, vs39
|
||||
#else
|
||||
xxlor vs2, vs32, vs32
|
||||
xxlor vs3, vs33, vs33
|
||||
xxlor vs0, vs34, vs34
|
||||
|
@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxlor vs7, vs37, vs37
|
||||
xxlor vs4, vs38, vs38
|
||||
xxlor vs5, vs39, vs39
|
||||
|
||||
#endif
|
||||
SAVE2 vs0,vs1,vs2,vs3,CO,0
|
||||
SAVE2 vs4,vs5,vs6,vs7,T1,0
|
||||
addi CO, CO, 32
|
||||
|
@ -702,6 +858,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
|
||||
lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
|
||||
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 1, vs34, vs48
|
||||
xvf64gerpp 2, vs36, vs48
|
||||
xvf64gerpp 3, vs38, vs48
|
||||
xvf64gerpp 0, vs40, vs49
|
||||
xvf64gerpp 1, vs42, vs49
|
||||
xvf64gerpp 2, vs44, vs49
|
||||
xvf64gerpp 3, vs46, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 1, vs34, vs49
|
||||
xvf64gerpp 2, vs36, vs49
|
||||
|
@ -710,6 +876,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvf64gerpp 1, vs42, vs48
|
||||
xvf64gerpp 2, vs44, vs48
|
||||
xvf64gerpp 3, vs46, vs48
|
||||
#endif
|
||||
.if \IsLast==1
|
||||
addi AO, AO, DISP16(\Index,256)
|
||||
addi BO, BO, DISP2(\Index,32)
|
||||
|
@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs45, vs12, vs13, 0b10
|
||||
xxpermdi vs46, vs14, vs15, 0b01
|
||||
xxpermdi vs47, vs14, vs15, 0b10
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxlor vs0, vs32, vs32
|
||||
xxlor vs1, vs33, vs33
|
||||
xxlor vs2, vs34, vs34
|
||||
xxlor vs3, vs35, vs35
|
||||
xxlor vs4, vs36, vs36
|
||||
xxlor vs5, vs37, vs37
|
||||
xxlor vs6, vs38, vs38
|
||||
xxlor vs7, vs39, vs39
|
||||
xxlor vs8, vs40, vs40
|
||||
xxlor vs9, vs41, vs41
|
||||
xxlor vs10, vs42, vs42
|
||||
xxlor vs11, vs43, vs43
|
||||
xxlor vs12, vs44, vs44
|
||||
xxlor vs13, vs45, vs45
|
||||
xxlor vs14, vs46, vs46
|
||||
xxlor vs15, vs47, vs47
|
||||
#else
|
||||
xxlor vs2, vs32, vs32
|
||||
xxlor vs3, vs33, vs33
|
||||
xxlor vs0, vs34, vs34
|
||||
|
@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxlor vs15, vs45, vs45
|
||||
xxlor vs12, vs46, vs46
|
||||
xxlor vs13, vs47, vs47
|
||||
|
||||
#endif
|
||||
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
|
||||
addi CO, CO, 128
|
||||
.endm
|
||||
|
@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
|
||||
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
|
||||
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 1, vs34, vs48
|
||||
xvf64gerpp 0, vs40, vs49
|
||||
xvf64gerpp 1, vs42, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 1, vs34, vs49
|
||||
xvf64gerpp 0, vs40, vs48
|
||||
xvf64gerpp 1, vs42, vs48
|
||||
#endif
|
||||
.if \IsLast==1
|
||||
addi AO, AO, DISP8(\Index,128)
|
||||
addi BO, BO, DISP2(\Index,32)
|
||||
|
@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs37, vs4, vs5, 0b10
|
||||
xxpermdi vs38, vs6, vs7, 0b01
|
||||
xxpermdi vs39, vs6, vs7, 0b10
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxlor vs0, vs32, vs32
|
||||
xxlor vs1, vs33, vs33
|
||||
xxlor vs2, vs34, vs34
|
||||
xxlor vs3, vs35, vs35
|
||||
xxlor vs4, vs36, vs36
|
||||
xxlor vs5, vs37, vs37
|
||||
xxlor vs6, vs38, vs38
|
||||
xxlor vs7, vs39, vs39
|
||||
#else
|
||||
xxlor vs2, vs32, vs32
|
||||
xxlor vs3, vs33, vs33
|
||||
xxlor vs0, vs34, vs34
|
||||
|
@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxlor vs7, vs37, vs37
|
||||
xxlor vs4, vs38, vs38
|
||||
xxlor vs5, vs39, vs39
|
||||
|
||||
#endif
|
||||
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
|
||||
addi CO, CO, 64
|
||||
.endm
|
||||
|
@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
|
||||
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
|
||||
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xvf64gerpp 0, vs32, vs48
|
||||
xvf64gerpp 0, vs40, vs49
|
||||
#else
|
||||
xvf64gerpp 0, vs32, vs49
|
||||
xvf64gerpp 0, vs40, vs48
|
||||
#endif
|
||||
.if \IsLast==1
|
||||
addi AO, AO, DISP4(\Index,64)
|
||||
addi BO, BO, DISP2(\Index,32)
|
||||
|
@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xxpermdi vs33, vs0, vs1, 0b10
|
||||
xxpermdi vs34, vs2, vs3, 0b01
|
||||
xxpermdi vs35, vs2, vs3, 0b10
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
xxlor vs0, vs32, vs32
|
||||
xxlor vs1, vs33, vs33
|
||||
xxlor vs2, vs34, vs34
|
||||
xxlor vs3, vs35, vs35
|
||||
#else
|
||||
xxlor vs2, vs32, vs32
|
||||
xxlor vs3, vs33, vs33
|
||||
xxlor vs0, vs34, vs34
|
||||
xxlor vs1, vs35, vs35
|
||||
#endif
|
||||
|
||||
SAVE2 vs0,vs1,vs2,vs3,CO,0
|
||||
addi CO, CO, 32
|
||||
|
|
|
@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
|||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
|
|
@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
|||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
|
|
@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
|
||||
#if defined(POWER10)
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
|
||||
|
|
|
@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if defined(DOUBLE)
|
||||
#include "zscal_microk_power8.c"
|
||||
#endif
|
||||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
|
||||
#elif defined(POWER10)
|
||||
#if defined(DOUBLE)
|
||||
#include "zscal_microk_power10.c"
|
||||
#else
|
||||
#include "cscal_microk_power10.c"
|
||||
#endif
|
||||
#elif defined(POWER10)
|
||||
#if defined(DOUBLE)
|
||||
#include "zscal_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
|||
|
||||
"xsnegdp 33, %x10 \n\t" // -alpha_i
|
||||
XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i
|
||||
#else
|
||||
XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i
|
||||
#endif
|
||||
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
|
@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
|||
"xvadddp 49, 49, 39 \n\t"
|
||||
"xvadddp 50, 50, %x3 \n\t"
|
||||
"xvadddp 51, 51, %x4 \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 48, 0(%2) \n\t"
|
||||
"stxv 49, 16(%2) \n\t"
|
||||
"stxv 50, 32(%2) \n\t"
|
||||
"stxv 51, 48(%2) \n\t"
|
||||
#else
|
||||
"stxv 49, 0(%2) \n\t"
|
||||
"stxv 48, 16(%2) \n\t"
|
||||
"stxv 51, 32(%2) \n\t"
|
||||
"stxv 50, 48(%2) \n\t"
|
||||
#endif
|
||||
|
||||
|
||||
"xvadddp 34, 34, %x5 \n\t"
|
||||
|
@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
|||
|
||||
"xvadddp 36, 36, %x7 \n\t"
|
||||
"xvadddp 37, 37, %x8 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 34, 64(%2) \n\t"
|
||||
"stxv 35, 80(%2) \n\t"
|
||||
"stxv 36, 96(%2) \n\t"
|
||||
"stxv 37, 112(%2) \n\t"
|
||||
#else
|
||||
"stxv 35, 64(%2) \n\t"
|
||||
"stxv 34, 80(%2) \n\t"
|
||||
"stxv 37, 96(%2) \n\t"
|
||||
"stxv 36, 112(%2) \n\t"
|
||||
|
||||
#endif
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -8 \n\t"
|
||||
|
@@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)

"xvadddp 50, 50, %x3 \n\t"
"xvadddp 51, 51, %x4 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%2) \n\t"
"stxv 49, 16(%2) \n\t"
"stxv 50, 32(%2) \n\t"
"stxv 51, 48(%2) \n\t"
#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
#endif
"xvadddp 34, 34, %x5 \n\t"
"xvadddp 35, 35, %x6 \n\t"

"xvadddp 36, 36, %x7 \n\t"
"xvadddp 37, 37, %x8 \n\t"

#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 34, 64(%2) \n\t"
"stxv 35, 80(%2) \n\t"
"stxv 36, 96(%2) \n\t"
"stxv 37, 112(%2) \n\t"
#else
"stxv 35, 64(%2) \n\t"
"stxv 34, 80(%2) \n\t"
"stxv 37, 96(%2) \n\t"
"stxv 36, 112(%2) \n\t"
#endif
"#n=%1 x=%0=%2 alpha=(%9,%10) \n"
:
"+m" (*x),
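A note on the pattern in the zscal hunks above: every `#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)` guard picks between two store orders at compile time, writing the registers of each lxvp pair in declared order on big-endian targets and swapped otherwise. A plain-C sketch of the same compile-time test (illustrative only, not from the patch):

    #include <stdio.h>

    /* Illustrative only: the same compiler-provided byte-order macros
     * that the new guards test, evaluated once at compile time. */
    int main(void)
    {
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
        puts("big endian: stxv 48/49/50/51 in declared pair order");
    #else
        puts("little endian: each lxvp pair is stored with registers swapped");
    #endif
        return 0;
    }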
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "zswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "zswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif
@@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c

SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c
SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c
SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c

SBGEMM_BETA = sgemm_beta_skylakex.c
SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c
SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c
SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c
SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c
SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]);
regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n]));


#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]);
regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n]));


#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \

@@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \
reg = _mm256_loadu_si256(x + idx_n);
reg = _mm256_loadu_si256((__m256i *)(x + idx_n));


#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \
reg = _mm_loadu_si128(x + idx_n);
reg = _mm_loadu_si128((__m128i *)(x + idx_n));


#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \
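The only change in these macros is a pointer-type fix: `_mm256_loadu_si256` and `_mm_loadu_si128` are declared to take `__m256i const *` / `__m128i const *`, so passing a `bfloat16` address needs an explicit cast. A minimal sketch of the fixed call (helper name illustrative, not from the patch):

    #include <immintrin.h>
    #include <stdint.h>

    /* Illustrative helper: loading 16 bf16 values from a uint16_t buffer
     * through _mm256_loadu_si256 requires the cast, because the intrinsic
     * takes (__m256i const *); without it, C compilers warn and C++
     * compilers reject the call. */
    static inline __m256i load_16_bf16(const uint16_t *p)
    {
        return _mm256_loadu_si256((const __m256i *)p);
    }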
@@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)

if (n2 < 64) {
__m128 accum_10, accum_11, accum_12, accum_13;
__m128 abs_mask1;
__m128 abs_mask1 = abs_mask1;

accum_10 = _mm_setzero_ps();
accum_11 = _mm_setzero_ps();
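The self-assignment introduced above appears to be the common trick for silencing "may be used uninitialized" warnings when the compiler cannot see that the variable is always written before use; the value is overwritten before any real read. A hedged sketch of the idiom (assumed intent, not stated in the patch):

    #include <immintrin.h>

    /* Illustrative only: the self-assignment placates the compiler;
     * the variable is fully overwritten before it is read. */
    static __m128 make_abs_mask(void)
    {
        __m128 abs_mask1 = abs_mask1;                     /* no-op, silences the warning */
        abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);   /* real initialization */
        return abs_mask1;
    }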
@@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)

__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
for (i = 0; i < tail_index_AVX2; i += 16) {
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask);
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask);
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

@@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)

__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
}

accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
@@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)

__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
}

accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
@@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)

__m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
for (i = 0; i < tail_index_AVX2; i += 32) {
accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask);
accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask);
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

@@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)

__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
}

accum_20 += accum_21;
@@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)

__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
}

accum_20 += accum_21;
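Context for the `0x7fffffffffffffff` / `0x7fffffff` masks in the asum hunks above: the absolute value of an IEEE-754 number is its bit pattern with the sign bit cleared, so the kernels AND each loaded vector against an all-ones-but-sign mask. The casts added by this patch only fix the integer-vs-float pointer type; a scalar sketch of the same trick (helper name illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar version of the mask trick: clearing bit 63 of a
     * double's bit pattern yields its absolute value. */
    static double fabs_bits(double x)
    {
        uint64_t u;
        memcpy(&u, &x, sizeof u);
        u &= 0x7fffffffffffffffULL;   /* same mask as _mm_set1_epi64x above */
        memcpy(&x, &u, sizeof u);
        return x;
    }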
@@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)

__m256 accum256_1 = _mm256_setzero_ps();
int tail_index_32 = n&(~31);
for (int j = 0; j < tail_index_32; j += 32) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16]));
}
accum256 = _mm256_add_ps(accum256, accum256_1);

/* Processing the remaining <32 chunk with 16-elements processing */
if ((n&16) != 0) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32]));
}
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));

/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16]));
}

/* Processing the remaining <8 chunk with masked 8-elements processing */

@@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)

} else if (n > 15) { /* n range from 16 to 31 */
/* Processing <32 chunk with 16-elements processing */
__m256 accum256 = _mm256_setzero_ps();
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0]));
accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));

/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16]));
}

/* Processing the remaining <8 chunk with masked 8-elements processing */

@@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)

}
} else if (n > 7) { /* n range from 8 to 15 */
/* Processing <16 chunk with 8-elements processing */
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0]));

/* Processing the remaining <8 chunk with masked 8-elements processing */
if ((n&7) != 0) {
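The `(__m256bh)`/`(__m128bh)` casts above feed the `_mm256_dpbf16_ps`-style bf16 dot-product intrinsics. As a reference for what those accumulate, a plain scalar sketch (bfloat16 being the upper 16 bits of an IEEE-754 float; names are illustrative, not from the patch):

    #include <stdint.h>

    /* Illustrative: a bfloat16 is the top 16 bits of a float, so widening
     * is a 16-bit left shift of the bit pattern. */
    static float bf16_to_fp32(uint16_t h)
    {
        union { uint32_t u; float f; } v;
        v.u = (uint32_t)h << 16;
        return v.f;
    }

    /* Scalar reference for the vectorized bf16 dot product. */
    static float sbdot_ref(long n, const uint16_t *x, const uint16_t *y)
    {
        float acc = 0.0f;
        for (long i = 0; i < n; i++)
            acc += bf16_to_fp32(x[i]) * bf16_to_fp32(y[i]);
        return acc;
    }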
@@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)

// K = any number, processed in blocks of 32; M <= 16
void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3;
bfloat16 * src_addr0;
bfloat16 * dst_addr0, * dst_addr1;

BLASLONG tag_k_32x = k & (~31);
@@ -0,0 +1,499 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <immintrin.h>
#include "common.h"

#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr))
#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr))
#define BROADCAST64(base, step, n, offset, zmm) \
if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \
else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2))

#define DECLARE_A_PAIR(A) \
__m512i A_lo_##A; __m512i A_hi_##A;

#define LOAD_A_PAIR(A) \
VMOVLDUP(ptr_a##A, A_lo_##A); \
VMOVHDUP(ptr_a##A, A_hi_##A);

#define MASK_LOAD_A_PAIR(A) { \
__m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \
A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \
A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \
}

#define LOAD_A_PAIR_TAIL(A) { \
__m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \
__m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \
A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \
A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \
}

#define MASK_LOAD_A_PAIR_TAIL(A) { \
__m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \
__m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \
A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \
A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \
}

#define DECLARE_B_PAIR() \
__m512i B_lo; __m512i B_hi;

#define PREFETCH_B_STEP 32
#define PREFETCH_B(Bx, By) \
if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \
else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2))

#define BROADCAST_B_PAIR(Bx, By) \
BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \
BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi);

#define MASK_BROADCAST_B_PAIR(Bx, x) {\
__m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \
B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \
B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \
}

#define BROADCAST_B_PAIR_TAIL(Bx, By) {\
__m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \
xmm = _mm_cvtepu16_epi32(xmm); \
B_lo = _mm512_broadcast_i32x2(xmm); \
B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \
}

#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\
__m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \
xmm = _mm_cvtepu16_epi32(xmm); \
B_lo = _mm512_broadcast_i32x2(xmm); \
B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \
}

#define DECLARE_RESULT_4X(A, Bx, By) \
__m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \
__m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \
__m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \
__m512 result_11_##A##Bx##By = _mm512_setzero_ps();

#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b)

#define MATMUL_4X(A, Bx, By) \
FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \
FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \
FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \
FMA(A_hi_##A, B_hi, result_11_##A##Bx##By);

#define _STORE_C_2nx16(addr, val0, val1) \
asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \
asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \
asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc))

#define _MASK_STORE_C_2nx16(addr, val0, val1) \
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \
asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \
asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask))

#define _REORDER_C_2X(result_0, result_1) { \
__m512 tmp0, tmp1; \
tmp0 = _mm512_unpacklo_ps(result_0, result_1); \
tmp1 = _mm512_unpackhi_ps(result_0, result_1); \
result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \
result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \
}

#define _STORE_2X(ptr_c, result_0, result_1) {\
_REORDER_C_2X(result_0, result_1) \
_STORE_C_2nx16(ptr_c, result_0, result_1); \
ptr_c += ldc * 2; \
}

#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\
_REORDER_C_2X(result_0, result_1) \
_MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \
ptr_c += ldc * 2; \
}

#define STORE_4X(A, Bx, By) { \
_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \
_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \
}

#define MASK_STORE_4X(A, Bx, By) { \
_MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \
_MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \
}

#define _STORE_C_16(addr, val0) \
asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr));

#define _MASK_STORE_C_16(addr, val0) \
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask));

#define N_STORE_4X(A, Bx, By) { \
_REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \
_REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \
switch(n_count) { \
case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \
case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \
case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \
} \
ptr_c##A += ldc * n_count; \
}

#define N_MASK_STORE_4X(A, Bx, By) { \
_REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \
_REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \
switch(n_count) { \
case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \
case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \
case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \
} \
ptr_c##A += ldc * n_count; \
}


int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
{
IFLOAT *ptr_a = A, *ptr_b = B;
IFLOAT *ptr_b0, *ptr_b1;
IFLOAT *ptr_a0, *ptr_a1;
FLOAT *ptr_c = C;
FLOAT *ptr_c0, *ptr_c1;
BLASLONG n_count = n;
BLASLONG m_count, k_count;
BLASLONG n_blksize = 4 * k;
BLASLONG cn_offset = 0;
__m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha));

for (; n_count > 23; n_count -= 24) {
IFLOAT *ptr_b00 = ptr_b;
IFLOAT *ptr_b10 = ptr_b + n_blksize * 3;
ptr_a0 = ptr_a;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
ptr_b1 = ptr_b10;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2);
k_count = k;
for (; k_count > 3; k_count -=4) {
LOAD_A_PAIR(0);
_mm_prefetch(ptr_a0 + 128, _MM_HINT_T0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;

LOAD_A_PAIR(0);
_mm_prefetch(ptr_a0 + 128, _MM_HINT_T0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
}
for (; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
ptr_a0 += 16;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4;
}
ptr_c0 = ptr_c;
STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2);
STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
ptr_b1 = ptr_b10;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
ptr_a0 += m_count * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
ptr_a0 += m_count;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4;
}
ptr_c0 = ptr_c;
MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2);
MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2);
ptr_c += m_count;
}
ptr_b += 24 * k;
cn_offset += 24;
}
for (; n_count > 11; n_count -= 12) {
IFLOAT *ptr_b00 = ptr_b;
ptr_a0 = ptr_a;
ptr_a1 = ptr_a + 16 * k;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 31; m_count -= 32) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0); DECLARE_A_PAIR(1);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0); LOAD_A_PAIR(1);
ptr_a0 += 16 * 2;
ptr_a1 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2);
ptr_b0 += 4 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1);
ptr_a0 += 16;
ptr_a1 += 16;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2);
ptr_b0 += 4;
}
ptr_c0 = ptr_c;
ptr_c1 = ptr_c + 16;
STORE_4X(0, 0, 0); STORE_4X(1, 0, 0);
STORE_4X(0, 0, 1); STORE_4X(1, 0, 1);
STORE_4X(0, 0, 2); STORE_4X(1, 0, 2);
ptr_c += 16 * 2;
ptr_a0 = ptr_a1;
ptr_a1 = ptr_a0 + 16 * k;
}
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
ptr_a0 += 16;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
}
ptr_c0 = ptr_c;
STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
ptr_a0 += m_count * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
ptr_a0 += m_count;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
}
ptr_c0 = ptr_c;
MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2);
ptr_c += m_count;
}
ptr_b += 12 * k;
cn_offset += 12;
}
for (; n_count > 3; n_count -= 4) {
IFLOAT *ptr_b00 = ptr_b;
ptr_a0 = ptr_a;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4 * 2;
ptr_a0 += 16 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4;
ptr_a0 += 16;
}
ptr_c0 = ptr_c;
STORE_4X(0, 0, 0);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4 * 2;
ptr_a0 += m_count * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4;
ptr_a0 += m_count;
}
ptr_c0 = ptr_c;
MASK_STORE_4X(0, 0, 0);
ptr_c += m_count;
}
ptr_b += 4 * k;
cn_offset += 4;
}
if (n_count > 0) {
__mmask8 nmask = (1UL << n_count) - 1;
IFLOAT *ptr_b00 = ptr_b;
ptr_a0 = ptr_a;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count * 2;
ptr_a0 += 16 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count;
ptr_a0 += 16;
}
ptr_c0 = ptr_c;
N_STORE_4X(0, 0, 0);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count * 2;
ptr_a0 += m_count * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count;
ptr_a0 += m_count;
}
ptr_c0 = ptr_c;
N_MASK_STORE_4X(0, 0, 0);
ptr_c += m_count;
}
}
return 0;
}
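The `(1UL << m_count) - 1` idiom used throughout this kernel builds an AVX-512 predicate with exactly `m_count` low bits set, so the masked loads and stores in the m/n tail paths touch only valid elements. A self-contained sketch of the same idiom (helper name illustrative, not from the patch):

    #include <immintrin.h>

    /* Illustrative tail handling: sum the first count floats (count < 16)
     * with one masked load; lanes outside the mask read as zero. */
    static float sum_tail(const float *p, int count)
    {
        __mmask16 mask = (__mmask16)((1u << count) - 1);  /* count low bits set */
        __m512 v = _mm512_maskz_loadu_ps(mask, p);
        return _mm512_reduce_add_ps(v);
    }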
@@ -0,0 +1,353 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <immintrin.h>
#include "common.h"

#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \
asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8))

#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \
__m512i v; \
t0 = _mm512_unpacklo_epi32(r0, r1); \
t1 = _mm512_unpackhi_epi32(r0, r1); \
t2 = _mm512_unpacklo_epi32(r2, r3); \
t3 = _mm512_unpackhi_epi32(r2, r3); \
t4 = _mm512_unpacklo_epi32(r4, r5); \
t5 = _mm512_unpackhi_epi32(r4, r5); \
t6 = _mm512_unpacklo_epi32(r6, r7); \
t7 = _mm512_unpackhi_epi32(r6, r7); \
_MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \
r0 = _mm512_mask_blend_epi32(kc, t0, v); \
r1 = _mm512_mask_blend_epi32(k3, t2, v); \
_MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \
r2 = _mm512_mask_blend_epi32(kc, t1, v); \
r3 = _mm512_mask_blend_epi32(k3, t3, v); \
_MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \
r4 = _mm512_mask_blend_epi32(kc, t4, v); \
r5 = _mm512_mask_blend_epi32(k3, t6, v); \
_MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \
r6 = _mm512_mask_blend_epi32(kc, t5, v); \
r7 = _mm512_mask_blend_epi32(k3, t7, v); \
t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \
t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \
t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \
t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \
t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \
t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \
t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \
t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \
}

#define STORE_512_LO(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \
_mm512_storeu_si512(boffset0 + x*32, v);

#define STORE_512_HI(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \
_mm512_storeu_si512(boffset0 + (x + 8)*32, v);

#define MASK_STORE_512_LO(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \
_mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v);

#define MASK_STORE_512_HI(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \
_mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v);

#define STORE_512(x, y) {\
__m512i v; \
if (x == 0) { STORE_512_LO(y); } \
else { STORE_512_HI(y); } \
}

#define MASK_STORE_512(x, y) {\
__m512i v; \
if (x == 0) { MASK_STORE_512_LO(y); } \
else { MASK_STORE_512_HI(y); } \
}

#define SET_TAIL(y, x) {\
if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \
else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \
}

#define GET_TAIL() \
switch (n_store + 1) { \
case 16: SET_TAIL(1, 7); break; \
case 15: SET_TAIL(1, 6); break; \
case 14: SET_TAIL(1, 5); break; \
case 13: SET_TAIL(1, 4); break; \
case 12: SET_TAIL(1, 3); break; \
case 11: SET_TAIL(1, 2); break; \
case 10: SET_TAIL(1, 1); break; \
case 9: SET_TAIL(1, 0); break; \
case 8: SET_TAIL(0, 7); break; \
case 7: SET_TAIL(0, 6); break; \
case 6: SET_TAIL(0, 5); break; \
case 5: SET_TAIL(0, 4); break; \
case 4: SET_TAIL(0, 3); break; \
case 3: SET_TAIL(0, 2); break; \
case 2: SET_TAIL(0, 1); break; \
case 1: SET_TAIL(0, 0); break; \
}


int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;

IFLOAT *boffset0;
IFLOAT *aoffset;
IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07;
IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17;
aoffset = a;
boffset0 = b;

BLASLONG n16 = n & ~15;
BLASLONG m32 = m & ~31;

int permute_table[] = {
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
};
u_int64_t permute_table2[] = {
0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3,
0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7,
};
__m512i idx_lo = _mm512_loadu_si512(permute_table);
__m512i idx_hi = _mm512_loadu_si512(permute_table + 16);
__m512i idx_lo2 = _mm512_loadu_si512(permute_table2);
__m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8);
__mmask16 kc = 0xcccc;
__mmask16 k3 = 0x3333;
__m512i r0, r1, r2, r3, r4, r5, r6, r7;
__m512i t00, t01, t02, t03, t04, t05, t06, t07;
__m512i t10, t11, t12, t13, t14, t15, t16, t17;

for (j = 0; j < n16; j += 16) {
aoffset00 = aoffset;
aoffset01 = aoffset00 + lda;
aoffset02 = aoffset01 + lda;
aoffset03 = aoffset02 + lda;
aoffset04 = aoffset03 + lda;
aoffset05 = aoffset04 + lda;
aoffset06 = aoffset05 + lda;
aoffset07 = aoffset06 + lda;
aoffset10 = aoffset07 + lda;
aoffset11 = aoffset10 + lda;
aoffset12 = aoffset11 + lda;
aoffset13 = aoffset12 + lda;
aoffset14 = aoffset13 + lda;
aoffset15 = aoffset14 + lda;
aoffset16 = aoffset15 + lda;
aoffset17 = aoffset16 + lda;
aoffset += 16 * lda;
for (i = 0; i < m32; i += 32) {
r0 = _mm512_loadu_si512(aoffset00 + i);
r1 = _mm512_loadu_si512(aoffset01 + i);
r2 = _mm512_loadu_si512(aoffset02 + i);
r3 = _mm512_loadu_si512(aoffset03 + i);
r4 = _mm512_loadu_si512(aoffset04 + i);
r5 = _mm512_loadu_si512(aoffset05 + i);
r6 = _mm512_loadu_si512(aoffset06 + i);
r7 = _mm512_loadu_si512(aoffset07 + i);
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
r0 = _mm512_loadu_si512(aoffset10 + i);
r1 = _mm512_loadu_si512(aoffset11 + i);
r2 = _mm512_loadu_si512(aoffset12 + i);
r3 = _mm512_loadu_si512(aoffset13 + i);
r4 = _mm512_loadu_si512(aoffset14 + i);
r5 = _mm512_loadu_si512(aoffset15 + i);
r6 = _mm512_loadu_si512(aoffset16 + i);
r7 = _mm512_loadu_si512(aoffset17 + i);
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3);
STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7);
STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3);
STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7);
boffset0 += 16 * 32;
}
if (i < m) {
int remain_m = m - i;
__mmask32 mmask = (1UL << remain_m) - 1;
r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i);
r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i);
r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i);
r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i);
r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i);
r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i);
r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i);
r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i);
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i);
r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i);
r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i);
r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i);
r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i);
r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i);
r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i);
r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i);
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
int n_store = remain_m/2;
switch (n_store) {
case 15: STORE_512(1, 6);
case 14: STORE_512(1, 5);
case 13: STORE_512(1, 4);
case 12: STORE_512(1, 3);
case 11: STORE_512(1, 2);
case 10: STORE_512(1, 1);
case 9: STORE_512(1, 0);
case 8: STORE_512(0, 7);
case 7: STORE_512(0, 6);
case 6: STORE_512(0, 5);
case 5: STORE_512(0, 4);
case 4: STORE_512(0, 3);
case 3: STORE_512(0, 2);
case 2: STORE_512(0, 1);
case 1: STORE_512(0, 0);
}
boffset0 += n_store * 32;
if (m & 0x1) {
__m512i tail;
GET_TAIL();
_mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail));
boffset0 += 16;
}
}

}
if (j < n) {
int remain_n = n - j;
__mmask16 nmask = (1UL << remain_n) - 1;
int load0, load1;
if (remain_n > 8) {
load0 = 8;
load1 = remain_n - 8;
} else {
load0 = remain_n;
load1 = 0;
}
aoffset00 = aoffset;
aoffset01 = aoffset00 + lda;
aoffset02 = aoffset01 + lda;
aoffset03 = aoffset02 + lda;
aoffset04 = aoffset03 + lda;
aoffset05 = aoffset04 + lda;
aoffset06 = aoffset05 + lda;
aoffset07 = aoffset06 + lda;
aoffset10 = aoffset07 + lda;
aoffset11 = aoffset10 + lda;
aoffset12 = aoffset11 + lda;
aoffset13 = aoffset12 + lda;
aoffset14 = aoffset13 + lda;
aoffset15 = aoffset14 + lda;
aoffset16 = aoffset15 + lda;
aoffset17 = aoffset16 + lda;
aoffset += 16 * lda;
for (i = 0; i < m32; i += 32) {
switch (load0) {
case 8: r7 = _mm512_loadu_si512(aoffset07 + i);
case 7: r6 = _mm512_loadu_si512(aoffset06 + i);
case 6: r5 = _mm512_loadu_si512(aoffset05 + i);
case 5: r4 = _mm512_loadu_si512(aoffset04 + i);
case 4: r3 = _mm512_loadu_si512(aoffset03 + i);
case 3: r2 = _mm512_loadu_si512(aoffset02 + i);
case 2: r1 = _mm512_loadu_si512(aoffset01 + i);
case 1: r0 = _mm512_loadu_si512(aoffset00 + i);
}
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
switch (load1) {
case 8: r7 = _mm512_loadu_si512(aoffset17 + i);
case 7: r6 = _mm512_loadu_si512(aoffset16 + i);
case 6: r5 = _mm512_loadu_si512(aoffset15 + i);
case 5: r4 = _mm512_loadu_si512(aoffset14 + i);
case 4: r3 = _mm512_loadu_si512(aoffset13 + i);
case 3: r2 = _mm512_loadu_si512(aoffset12 + i);
case 2: r1 = _mm512_loadu_si512(aoffset11 + i);
case 1: r0 = _mm512_loadu_si512(aoffset10 + i);
}
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3);
MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7);
MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3);
MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7);
boffset0 += remain_n * 32;
}
if (i < m) {
int remain_m = m - i;
__mmask32 mmask = (1UL << remain_m) - 1;
switch (load0) {
case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i);
case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i);
case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i);
case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i);
case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i);
case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i);
case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i);
case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i);
}
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
switch (load1) {
case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i);
case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i);
case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i);
case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i);
case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i);
case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i);
case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i);
case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i);
}
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
int n_store = remain_m/2;
switch (n_store) {
case 15: MASK_STORE_512(1, 6);
case 14: MASK_STORE_512(1, 5);
case 13: MASK_STORE_512(1, 4);
case 12: MASK_STORE_512(1, 3);
case 11: MASK_STORE_512(1, 2);
case 10: MASK_STORE_512(1, 1);
case 9: MASK_STORE_512(1, 0);
case 8: MASK_STORE_512(0, 7);
case 7: MASK_STORE_512(0, 6);
case 6: MASK_STORE_512(0, 5);
case 5: MASK_STORE_512(0, 4);
case 4: MASK_STORE_512(0, 3);
case 3: MASK_STORE_512(0, 2);
case 2: MASK_STORE_512(0, 1);
case 1: MASK_STORE_512(0, 0);
}
boffset0 += n_store * remain_n * 2;
if (m & 0x1) {
__m512i tail;
GET_TAIL();
_mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail));
}
}
}
return 0;
}
@@ -0,0 +1,208 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <immintrin.h>
#include "common.h"

#define REORDER_4x32(r0, r1, r2, r3) {\
__m512i t0, t1, t2, t3; \
t0 = _mm512_unpacklo_epi32(r0, r1); \
t1 = _mm512_unpackhi_epi32(r0, r1); \
t2 = _mm512_unpacklo_epi32(r2, r3); \
t3 = _mm512_unpackhi_epi32(r2, r3); \
r0 = _mm512_unpacklo_epi64(t0, t2); \
r1 = _mm512_unpackhi_epi64(t0, t2); \
r2 = _mm512_unpacklo_epi64(t1, t3); \
r3 = _mm512_unpackhi_epi64(t1, t3); \
t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \
t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \
t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \
t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \
r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \
r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \
r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \
r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \
}

#define REORDER_4x8(r0, r1, r2, r3) {\
__m128i t0, t1, t2, t3; \
t0 = _mm_unpacklo_epi32(r0, r1); \
t1 = _mm_unpackhi_epi32(r0, r1); \
t2 = _mm_unpacklo_epi32(r2, r3); \
t3 = _mm_unpackhi_epi32(r2, r3); \
r0 = _mm_unpacklo_epi64(t0, t2); \
r1 = _mm_unpackhi_epi64(t0, t2); \
r2 = _mm_unpacklo_epi64(t1, t3); \
r3 = _mm_unpackhi_epi64(t1, t3); \
}

#define GET_TAIL(tail, remain_m) \
switch((remain_m + 1)/2) { \
case 1: tail = r0; break; \
case 2: tail = r1; break; \
case 3: tail = r2; break; \
case 4: tail = r3; break; \
}

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;
IFLOAT *aoffset;
IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3;

IFLOAT *boffset;

aoffset = a;
boffset = b;

BLASLONG m32 = m & ~31;
BLASLONG m8 = m & ~7;
BLASLONG n4 = n & ~3;

int permute_table[] = {
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
};
__m512i idx_lo_128 = _mm512_loadu_si512(permute_table);
__m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16);
__m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32);
__m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48);

for (j = 0; j < n4; j += 4) {
aoffset0 = aoffset;
aoffset1 = aoffset0 + lda;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset += 4 * lda;

for (i = 0; i < m32; i += 32) {
__m512i r0, r1, r2, r3;
r0 = _mm512_loadu_si512(aoffset0 + i);
r1 = _mm512_loadu_si512(aoffset1 + i);
r2 = _mm512_loadu_si512(aoffset2 + i);
r3 = _mm512_loadu_si512(aoffset3 + i);
REORDER_4x32(r0, r1, r2, r3);
_mm512_storeu_si512(boffset + 32*0, r0);
_mm512_storeu_si512(boffset + 32*1, r1);
_mm512_storeu_si512(boffset + 32*2, r2);
_mm512_storeu_si512(boffset + 32*3, r3);
boffset += 32 * 4;
}
for (; i < m8; i += 8) {
__m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i));
__m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i));
__m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i));
__m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i));
REORDER_4x8(r0, r1, r2, r3);
_mm_storeu_si128((void *)(boffset + 8*0), r0);
_mm_storeu_si128((void *)(boffset + 8*1), r1);
_mm_storeu_si128((void *)(boffset + 8*2), r2);
_mm_storeu_si128((void *)(boffset + 8*3), r3);
boffset += 8 * 4;
}
if (i < m) {
int remain_m = m - i;
__mmask8 r_mask = (1UL << remain_m) - 1;
__m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i);
__m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i);
__m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i);
__m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i);
REORDER_4x8(r0, r1, r2, r3);

// store should skip the tail odd line
int num_store = remain_m/2;
switch(num_store) {
case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2);
case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1);
case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0);
}
boffset += 8 * num_store;

if (m & 0x1) { // handling the tail
__m128i tail;
GET_TAIL(tail, remain_m);
/* the tail vector is filled with zeros like:
 * a, 0, b, 0, c, 0, d, 0
 * so extract the low words of the data and store them
 */
tail = _mm_cvtepi32_epi16(tail);
_mm_store_sd((double *)boffset, (__m128d) tail); // only the lower 4 bfloat16 values are valid
||||
boffset += 4;
|
||||
}
|
||||
}
|
||||
}
  if (j < n) {
    int remain_n = n - j;
    __mmask8 nmask = (1UL << remain_n) - 1;
    aoffset0 = aoffset;
    aoffset1 = aoffset0 + lda;
    aoffset2 = aoffset1 + lda;
    aoffset3 = aoffset2 + lda;
    __m128i r0, r1, r2, r3;
    for (i = 0; i < m8; i += 8) {
      switch (remain_n) {
        case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); /* fall through */
        case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); /* fall through */
        case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i));
      }
      REORDER_4x8(r0, r1, r2, r3);
      _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0);
      _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1);
      _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2);
      _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3);
      boffset += 8 * remain_n;
    }
    if (i < m) {
      int remain_m = m - i;
      __mmask8 mmask = (1UL << remain_m) - 1;
      switch (remain_n) {
        case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); /* fall through */
        case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); /* fall through */
        case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i);
      }
      REORDER_4x8(r0, r1, r2, r3);

      int num_store = remain_m/2;
      switch (num_store) {
        case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); /* fall through */
        case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); /* fall through */
        case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0);
      }
      boffset += 2 * num_store * remain_n;

      if (m & 0x1) {
        __m128i tail;
        GET_TAIL(tail, remain_m);
        tail = _mm_cvtepi32_epi16(tail);
        _mm_mask_storeu_epi16(boffset, nmask, tail);
      }
    }
  }
  return 0;
}
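The masks used throughout this kernel all follow one idiom: `(1UL << k) - 1` sets the low k bits, so a masked load or store touches exactly the k valid tail elements. A tiny standalone illustration (not kernel code):

#include <stdio.h>

int main(void) {
    for (int k = 1; k < 8; k++)
        printf("k=%d -> mask=0x%02lx\n", k, (1UL << k) - 1);
    return 0;
}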

@@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
        double MNK = (double) M * (double) N * (double) K;
        if (MNK > 256.0*256.0*256.0) // disable the small-matrix path for large problems
                return 0;
        /* the small matrix kernel works well for N = 8, 16, 32 */
        if (N == 8 || N == 16 || N == 32)
                return 1;
        return 0;
}
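For context, a sketch of the gate this routine implements; the standalone permit() below simply mirrors the logic above, and the example values are illustrative:

#include <stdio.h>

static int permit(long M, long N, long K) {
    double MNK = (double)M * (double)N * (double)K;
    if (MNK > 256.0 * 256.0 * 256.0)        /* too large for the small kernel */
        return 0;
    return (N == 8 || N == 16 || N == 32);  /* widths the kernel handles well */
}

int main(void) {
    printf("%d\n", permit(64, 16, 64));     /* 1: small problem, N = 16 */
    printf("%d\n", permit(1024, 32, 1024)); /* 0: MNK exceeds 256^3 */
    printf("%d\n", permit(64, 10, 64));     /* 0: unsupported N */
    return 0;
}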

@@ -0,0 +1,164 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <immintrin.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j;

  IFLOAT *boffset0, *boffset1;

  boffset0 = b;

  BLASLONG n32 = n & ~31;
  BLASLONG m4 = m & ~3;
  BLASLONG m2 = m & ~1;

  uint32_t permute_table[] = {
    0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
  };

  __m512i idx_lo = _mm512_loadu_si512(permute_table);
  __m512i idx_hi = _mm512_loadu_si512(permute_table + 16);

  for (j = 0; j < n32; j += 32) {
    /* process 2x16 columns of n at the same time */
    boffset1 = boffset0 + m * 16;
    for (i = 0; i < m4; i += 4) {
      /* the bf16 FMA needs a special memory layout:
       * for a source layout like
       *   a00, a01, a02, a03, a04, a05 ....
       *   a10, a11, a12, a13, a14, a15 ....
       * the rows must be copied interleaved as
       *   a00, a10, a01, a11, a02, a12, a03, a13, ...
       */
      __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
      __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]);
      __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]);
      __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]);

      __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
      __m512i a01 = _mm512_unpackhi_epi16(a0, a1);
      __m512i a10 = _mm512_unpacklo_epi16(a2, a3);
      __m512i a11 = _mm512_unpackhi_epi16(a2, a3);

      a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
      a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
      a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11);
      a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11);

      _mm512_storeu_si512(boffset0, a0);
      _mm512_storeu_si512(boffset1, a1);
      _mm512_storeu_si512(boffset0 + 32, a2);
      _mm512_storeu_si512(boffset1 + 32, a3);
      boffset0 += 64;
      boffset1 += 64;
    }
    for (; i < m2; i += 2) {
      __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
      __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]);

      __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
      __m512i a01 = _mm512_unpackhi_epi16(a0, a1);

      a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
      a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);

      _mm512_storeu_si512(boffset0, a0);
      _mm512_storeu_si512(boffset1, a1);
      boffset0 += 32;
      boffset1 += 32;
    }
    for (; i < m; i++) {
      /* just copy the single remaining row */
      __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]);
      __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]);
      _mm256_storeu_si256((void *)boffset0, a0);
      _mm256_storeu_si256((void *)boffset1, a1);
      boffset0 += 16;
      boffset1 += 16;
    }
    boffset0 = boffset1;
  }
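A scalar reference for the two-row interleave performed by the unpack/permute sequence above (a readability sketch; the kernel does this 32 columns at a time with AVX-512):

#include <stdint.h>
#include <stdio.h>

static void interleave2(const uint16_t *row0, const uint16_t *row1,
                        uint16_t *dst, int n) {
    for (int k = 0; k < n; k++) {
        dst[2*k + 0] = row0[k];   /* a0k */
        dst[2*k + 1] = row1[k];   /* a1k */
    }
}

int main(void) {
    uint16_t r0[4] = {1, 2, 3, 4}, r1[4] = {5, 6, 7, 8}, out[8];
    interleave2(r0, r1, out, 4);
    for (int k = 0; k < 8; k++)
        printf("%u ", out[k]);    /* 1 5 2 6 3 7 4 8 */
    printf("\n");
    return 0;
}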
  if (j < n) {
    uint32_t remains = n - j;
    __mmask32 r_mask = (1UL << remains) - 1;
    if (remains > 16) {
      boffset1 = boffset0 + m * 16;
      uint32_t tail1 = remains - 16;
      __mmask16 w_mask1 = (1UL << tail1) - 1;
      for (i = 0; i < m2; i += 2) {
        __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);

        __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
        __m512i a01 = _mm512_unpackhi_epi16(a0, a1);

        a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
        a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);

        _mm512_storeu_si512(boffset0, a0);
        _mm512_mask_storeu_epi32(boffset1, w_mask1, a1);

        boffset0 += 32;
        boffset1 += 2 * tail1;
      }
      for (; i < m; i++) {
        __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]);
        __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]);
        _mm256_storeu_si256((void *)boffset0, a0);
        _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1);
        boffset0 += 16;
        boffset1 += tail1;
      }
    } else {
      __mmask16 w_mask = (1UL << remains) - 1;
      for (i = 0; i < m2; i += 2) {
        __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);

        __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
        __m512i a01 = _mm512_unpackhi_epi16(a0, a1);

        a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);

        _mm512_mask_storeu_epi32(boffset0, w_mask, a0);
        boffset0 += 2 * remains;
      }
      for (; i < m; i++) {
        __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]);
        _mm256_mask_storeu_epi16(boffset0, w_mask, a0);
        boffset0 += remains;
      }
    }
  }
  return 0;
}
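The `_mm512_maskz_loadu_epi16` calls above rely on AVX-512 fault suppression: lanes cleared in the mask are neither read nor faulted, and come back zeroed. A minimal demonstration (assumes an AVX512BW-capable CPU; compile with -mavx512bw):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t src[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    uint16_t dst[32] = {0};
    uint32_t remains = 10;
    __mmask32 r_mask = (1UL << remains) - 1;           /* low 10 bits set */
    __m512i v = _mm512_maskz_loadu_epi16(r_mask, src); /* tail lanes zeroed */
    _mm512_storeu_si512(dst, v);
    printf("%u %u %u\n", dst[9], dst[10], dst[31]);    /* prints: 10 0 0 */
    return 0;
}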

@@ -0,0 +1,216 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <immintrin.h>
#include "common.h"

#define STORE_VEC(Bx, By, vec) \
  if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \
  else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2));

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j;

  IFLOAT *boffset0, *boffset1;

  boffset0 = b;

  BLASLONG n24 = n - (n % 24);
  BLASLONG n8 = n & ~7;
  BLASLONG m8 = m & ~7;
  BLASLONG m4 = m & ~3;
  BLASLONG m2 = m & ~1;

  int permute_table[] = {
    0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
    0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  };

  j = 0;
  if (n > 23) {
    /* n = 24 is the maximum width in the current blocking setting */
    __m512i idx_lo_128 = _mm512_loadu_si512(permute_table);
    __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16);
    __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32);
    __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48);
    __mmask32 mask24 = (1UL << 24) - 1;
    BLASLONG blk_size = m * 4;
    BLASLONG stride = blk_size * 3;

    for (; j < n24; j += 24) {
      boffset1 = boffset0 + stride;
      for (i = 0; i < m8; i += 8) {
        __m512i r0, r1, r2, r3, r4, r5, r6, r7;
        __m512i t0, t1, t2, t3, t4, t5, t6, t7;
        r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]);
        r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]);
        r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]);
        r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]);
        r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]);
        r5 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]);
        r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]);
        r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]);

        t0 = _mm512_unpacklo_epi16(r0, r1);
        t1 = _mm512_unpackhi_epi16(r0, r1);
        t2 = _mm512_unpacklo_epi16(r2, r3);
        t3 = _mm512_unpackhi_epi16(r2, r3);
        t4 = _mm512_unpacklo_epi16(r4, r5);
        t5 = _mm512_unpackhi_epi16(r4, r5);
        t6 = _mm512_unpacklo_epi16(r6, r7);
        t7 = _mm512_unpackhi_epi16(r6, r7);

        r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2);
        r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3);
        r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6);
        r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7);
        r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2);
        r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3);
        r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6);
        r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7);

        t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2);
        t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3);
        t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6);
        t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7);
        t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2);
        t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3);

        STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2);
        STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5);
        boffset0 += 32;
        boffset1 += 32;
      }
      for (; i < m2; i += 2) {
        __m512i r0, r1, t0, t1;
        r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]);
        r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]);
        t0 = _mm512_unpacklo_epi16(r0, r1);
        t1 = _mm512_unpackhi_epi16(r0, r1);
        STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0));
        STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0));
        STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1));
        STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1));
        STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2));
        STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2));
        boffset0 += 8;
        boffset1 += 8;
      }
      for (; i < m; i++) {
        *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0];
        *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4];
        *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8];
        *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12];
        *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16];
        *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20];
        boffset0 += 4;
        boffset1 += 4;
      }
      boffset0 += stride * 2;
    }
  }

  for (; j < n8; j += 8) {
    boffset1 = boffset0 + m * 4;
    for (i = 0; i < m4; i += 4) {
      __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
      __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
      __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]);
      __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]);
      __m128i a00 = _mm_unpacklo_epi16(a0, a1);
      __m128i a01 = _mm_unpackhi_epi16(a0, a1);
      __m128i a10 = _mm_unpacklo_epi16(a2, a3);
      __m128i a11 = _mm_unpackhi_epi16(a2, a3);
      _mm_storeu_si128((void *)(boffset0 + 0), a00);
      _mm_storeu_si128((void *)(boffset0 + 8), a10);
      _mm_storeu_si128((void *)(boffset1 + 0), a01);
      _mm_storeu_si128((void *)(boffset1 + 8), a11);
      boffset0 += 16;
      boffset1 += 16;
    }
    for (; i < m2; i += 2) {
      __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
      __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
      __m128i a00 = _mm_unpacklo_epi16(a0, a1);
      __m128i a01 = _mm_unpackhi_epi16(a0, a1);
      _mm_storeu_si128((void *)(boffset0 + 0), a00);
      _mm_storeu_si128((void *)(boffset1 + 0), a01);
      boffset0 += 8;
      boffset1 += 8;
    }
    for (; i < m; i++) {
      __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]);
      _mm_store_sd((void *)boffset0, a0);
      _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1));
      boffset0 += 4;
      boffset1 += 4;
    }
    boffset0 = boffset1;
  }
  if (j < n) {
    uint32_t remains = n - j;
    __mmask8 r_mask = (1UL << remains) - 1;
    if (remains > 4) {
      boffset1 = boffset0 + m * 4;
      uint32_t tail1 = remains - 4;
      __mmask8 w_mask1 = (1UL << tail1) - 1;
      for (i = 0; i < m2; i += 2) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
        __m128i a00 = _mm_unpacklo_epi16(a0, a1);
        __m128i a01 = _mm_unpackhi_epi16(a0, a1);
        _mm_storeu_si128((void *)boffset0, a00);
        _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01);
        boffset0 += 8;
        boffset1 += 2 * tail1;
      }
      for (; i < m; i++) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        _mm_store_sd((void *)boffset0, (__m128d) a0);
        _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1));
        boffset0 += 4;
        boffset1 += tail1;
      }
    } else {
      for (i = 0; i < m2; i += 2) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
        __m128i a00 = _mm_unpacklo_epi16(a0, a1);
        _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00);
        boffset0 += 2 * remains;
      }
      for (; i < m; i++) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0);
      }
    }
  }
  return 0;
}
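Note the two rounding styles in this file: power-of-two widths round down with a bitmask (`n & ~7`), while the non-power-of-two block width 24 needs a modulo (`n - (n % 24)`). A one-line check:

#include <stdio.h>

int main(void) {
    long n = 100;
    printf("n24=%ld n8=%ld\n", n - (n % 24), n & ~7L);  /* prints: 96 96 */
    return 0;
}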

@@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Include common macros for BF16 based operations with IA intrinsics
#include "bf16_common_macros.h"

#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE8_COMPLETE_RESULT
#undef STORE8_MASK_COMPLETE_RESULT
#undef STORE4_COMPLETE_RESULT
#undef STORE4_MASK_COMPLETE_RESULT

#ifndef ZERO_BETA // Beta is non-zero

#ifndef ONE_BETA // BETA is not ONE
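The #undef block lets this file re-specialize the STORE*_RESULT macros from bf16_common_macros.h for its beta mode. A scalar analogue of the three-way specialization (hypothetical macro bodies; the real macros store vectors):

#include <stdio.h>

#ifdef ZERO_BETA
#define STORE_RESULT(y, acc, alpha, beta) ((y) = (alpha) * (acc))
#elif defined(ONE_BETA)
#define STORE_RESULT(y, acc, alpha, beta) ((y) = (alpha) * (acc) + (y))
#else
#define STORE_RESULT(y, acc, alpha, beta) ((y) = (alpha) * (acc) + (beta) * (y))
#endif

int main(void) {
    double y = 2.0;
    STORE_RESULT(y, 3.0, 1.5, 0.5);   /* general case: 1.5*3 + 0.5*2 = 5.5 */
    printf("%g\n", y);
    return 0;
}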

@@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3;

@@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);

        unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
        unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
        __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value);

        accum512_0 = _mm512_setzero_ps();
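The type change above matters because the mask value is reinterpreted through a `__mmask32 *`, which reads four bytes; backing it with a 16-bit variable made that read run past the object. A sketch of the safe pattern with plain integers (illustrative values):

#include <stdio.h>
#include <string.h>

int main(void) {
    long m = 35;                                        /* m & 15 == 3 */
    unsigned int value = (0xffffu >> (16 - (m & 15)));  /* low 3 bits set */
    unsigned int mask;
    memcpy(&mask, &value, sizeof(mask));  /* safe: the source is 4 bytes wide */
    printf("0x%x\n", mask);               /* prints: 0x7 */
    return 0;
}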

@@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Include common macros for BF16 based operations with IA intrinsics
#include "bf16_common_macros.h"

#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE8_COMPLETE_RESULT
#undef STORE8_MASK_COMPLETE_RESULT
#undef STORE4_COMPLETE_RESULT
#undef STORE4_MASK_COMPLETE_RESULT

#ifndef ZERO_BETA // Beta is non-zero

#ifndef ONE_BETA // BETA is not ONE

@@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    unsigned char load_mask_value = (((unsigned char)0xff) >> 6);

@@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    } else if (tail_num == 8) {
        __m256 result256 = _mm256_setzero_ps();

        __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]);             // Load 8 rows with n=2
        __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]);  // Load 8 rows with n=2
        __m256i xArray256 = _mm512_castsi512_si256(xArray);
        result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256);
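The casts being added throughout this hunk satisfy `_mm256_loadu_si256`'s prototype, which in C takes a `__m256i const *` rather than a bfloat16 pointer. A minimal illustration (assumes AVX2; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t a[16] = {0};
    __m256i v = _mm256_loadu_si256((__m256i *)a);  /* cast avoids the
                                                      incompatible-pointer
                                                      diagnostic */
    (void)v;
    puts("ok");
    return 0;
}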

@@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);

@@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
            result256_0 = _mm256_setzero_ps();
            result256_1 = _mm256_setzero_ps();

            matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]);                         // Load 5 rows with n=3 plus 1 element
            matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]);                 // Load 5 rows with n=3 plus 1 element
            matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]);                // Load 5 rows with n=3 plus 1 element
            matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]);              // Load 5 rows with n=3 plus 1 element
            matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]);      // Load 5 rows with n=3 plus 1 element
            matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]);     // Load 5 rows with n=3 plus 1 element

            matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);    // Select the first 2 elements for each row
            matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);    // Select the first 2 elements for each row

@@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
            if (tail_num > 10) {
                unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1)));
                __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
                matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]);                     // Load 5 rows with n=3 plus 1 element
                matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]);             // Load 5 rows with n=3 plus 1 element
                matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]);          // Load 5 rows with n=3 plus 1 element
                matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]);  // Load 5 rows with n=3 plus 1 element
                matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]);  // Load m-tag_m_32x-10 rows

                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);    // Select the first 2 elements for each row

@@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
            } else if (tail_num > 5) {
                unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2)));
                __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
                matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]);                     // Load 5 rows with n=3 plus 1 element
                matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]);          // Load 5 rows with n=3 plus 1 element
                matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]);     // Load m-tag_m_32x-5 rows
                matrixArray256_2 = _mm256_setzero_si256();

@@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_1 = _mm512_set1_epi32(1);

@@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512 result_0, result_1;

@@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_1 = _mm512_set1_epi32(1);

@@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,

        result256_0 = _mm256_setzero_ps();

        matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]);                        // Load 2 rows with n=6 plus 4 element
        matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]);                // Load 2 rows with n=6 plus 4 element
        matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]);                // Load 2 rows with n=6 plus 4 element
        matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]);             // Load 2 rows with n=6 plus 4 element
        matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]);     // Load 2 rows with n=6 plus 4 element
        matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]);     // Load 2 rows with n=6 plus 4 element

        // Process the 0|1 elements
        // Select the 0|1 elements for each row

@@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_2 = _mm512_set1_epi32(2);

@@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
{
    BLASLONG tag_m_16x = m & (~15);

    __m128i x128 = _mm_loadu_si128(x);                 // |x0|x1|x2|x3|x4|x5|x6|x7|
    __m128i x128 = _mm_loadu_si128((__m128i *)x);      // |x0|x1|x2|x3|x4|x5|x6|x7|

    if (tag_m_16x > 0) {
        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;

@@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_2 = _mm512_set1_epi32(2);

@@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m128 result128, tmp128;
    for (BLASLONG i = tag_m_16x; i < m; i++) {
        result128 = _mm_setzero_ps();
        matrixArray128 = _mm_loadu_si128(&a[(i)*8]);              // Load 1 rows with n=8
        matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]);   // Load 1 rows with n=8
        result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
        tmp128 = _mm_shuffle_ps(result128, result128, 14);
        result128 = _mm_add_ps(result128, tmp128);

@@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,

    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7);
    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
    __m128i x128_0 = _mm_loadu_si128(x);               // |x0|x1|x2|x3|x4|x5|x6|x7|
    __m128i x128_0 = _mm_loadu_si128((__m128i *)x);    // |x0|x1|x2|x3|x4|x5|x6|x7|
    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8));    // |x8|0 |0 | 0| 0| 0| 0| 0|

    if (tag_m_14x > 0) {

@@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m256i M256_EPI16_2 = _mm256_set1_epi16(2);

@@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x

    unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3);
    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
    __m128i x128_0 = _mm_loadu_si128(x);               // |x0|x1|x2|x3|x4|x5|x6|x7|
    __m128i x128_0 = _mm_loadu_si128((__m128i *)x);    // |x0|x1|x2|x3|x4|x5|x6|x7|
    __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8));    // |x8|x9|0 | 0| 0| 0| 0| 0|

    if (tag_m_12x > 0) {

@@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m256i M256_EPI32_1 = _mm256_set1_epi32(1);

@@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x

    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);
    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
    __m128i x128_0 = _mm_loadu_si128(x);               // |x0|x1| x2|x3|x4|x5|x6|x7|
    __m128i x128_0 = _mm_loadu_si128((__m128i *)x);    // |x0|x1| x2|x3|x4|x5|x6|x7|
    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8));    // |x8|x9|x10| 0| 0| 0| 0| 0|

    if (tag_m_15x > 0) {

@@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;

@@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x

    unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4);
    __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
    __m128i x128_0 = _mm_loadu_si128(x);               // |x0|x1| x2| x3|x4|x5|x6|x7|
    __m128i x128_0 = _mm_loadu_si128((__m128i *)x);    // |x0|x1| x2| x3|x4|x5|x6|x7|
    __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8));    // |x8|x9|x10|x11| 0| 0| 0| 0|

    if (tag_m_15x > 0) {

@@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;

@@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
    __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);

    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6);
    __mmask32 load_mask = *((__mmask32*) &load_mask_value);

    // Prepare X with 2-step interleave way
    xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
    BF16_INTERLEAVE_1x32(xArray)

@@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_4 = _mm512_set1_epi32(4);

@@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
    __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);

    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);
    __mmask32 load_mask = *((__mmask32*) &load_mask_value);

    // Prepare X with 2-step interleave way
    xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
    BF16_INTERLEAVE_1x32(xArray)

@@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
{
    BLASLONG tag_m_16x = m & (~15);

    __m256i x256 = _mm256_loadu_si256(x);               // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|
    __m256i x256 = _mm256_loadu_si256((__m256i *)x);    // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|

    if (tag_m_16x > 0) {
        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \

@@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i M512_EPI32_4 = _mm512_set1_epi32(4);

@@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
    __m128 accum128, tmp128;
    for (BLASLONG i = tag_m_16x; i < m; i++) {
        accum256 = _mm256_setzero_ps();
        matrixArray256 = _mm256_loadu_si256(&a[(i)*16]);              // Load 1 rows with n=16
        matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]);   // Load 1 rows with n=16
        accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
        accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
        tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);

@@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \

@@ -2647,8 +2680,6 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b
    BLASLONG tag_n_32x = n & (~31);
    BLASLONG tag_n_128x = n & (~127);

    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
    __m512 accum512_bridge[8];
    __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
    __m256 accum256_0;

@@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;

@@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;

@@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16
    __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha));
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
    __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta));
#endif
#endif

    __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \

@@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16
    __m128 accum128, tmp128;
    for (BLASLONG i = tag_m_8x; i < m; i++) {
        accum256_0 = _mm256_setzero_ps();
        matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]);              // Load 1 rows with n=16
        matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]);   // Load 1 rows with n=16
        accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
        accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
        tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);

@@ -41,7 +41,7 @@
#include <immintrin.h>

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
          FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
          IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
          FLOAT *c, BLASLONG ldc){

    BLASLONG i, j;

@@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT

#endif

#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL

#ifndef HAVE_KERNEL_4x2

static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

@@ -170,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

}

#endif
#endif

#ifndef HAVE_KERNEL_4x1

@@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)

    if (n2 < 32) {
        __m128d accum_10, accum_11, accum_12, accum_13;
        __m128d abs_mask1;
        __m128d abs_mask1 = abs_mask1;

        accum_10 = _mm_setzero_pd();
        accum_11 = _mm_setzero_pd();
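The replacement line is a self-initialization idiom commonly used to silence uninitialized-variable warnings when the starting register contents are irrelevant (the mask is built in place afterwards). Formally it reads an indeterminate value, so it trades standard-cleanliness for a quiet build; a scalar illustration:

#include <stdio.h>

int main(void) {
    int x = x;       /* self-init: suppresses -Wuninitialized on GCC */
    x = 42;          /* the real value is assigned before meaningful use */
    printf("%d\n", x);
    return 0;
}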

@@ -351,7 +351,7 @@
*
*     Quick return if possible
*
      IF( N.LE.0 ) THEN
      IF( (N.LE.0) .OR. (M.LE.0) ) THEN
         RETURN
      END IF
*

@@ -353,7 +353,7 @@
*
*     Quick return if possible
*
      IF( N.LE.0 ) THEN
      IF( (N.LE.0).OR.(M.LE.0) ) THEN
         RETURN
      END IF
*

@@ -353,7 +353,7 @@
*
*     Quick return if possible
*
      IF( N.LE.0 ) THEN
      IF( (N.LE.0).OR.(M.LE.0) ) THEN
         RETURN
      END IF
*

@@ -351,7 +351,7 @@
*
*     Quick return if possible
*
      IF( N.LE.0 ) THEN
      IF( (N.LE.0).OR.(M.LE.0) ) THEN
         RETURN
      END IF
*

16 param.h

@@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define USE_SGEMM_KERNEL_DIRECT 1

#undef  SBGEMM_DEFAULT_UNROLL_N
#undef  SBGEMM_DEFAULT_UNROLL_M
#undef  SBGEMM_DEFAULT_P
#undef  SBGEMM_DEFAULT_R
#undef  SBGEMM_DEFAULT_Q
#define SBGEMM_DEFAULT_UNROLL_N 4
#define SBGEMM_DEFAULT_UNROLL_M 16
#define SBGEMM_DEFAULT_P 384
#define SBGEMM_DEFAULT_Q 768
#define SBGEMM_DEFAULT_R sbgemm_r
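Back-of-the-envelope footprint for the new SBGEMM blocking, assuming the usual OpenBLAS scheme where a P x Q panel of A is packed per thread and bf16 elements are 2 bytes (the numbers below just evaluate the defines):

#include <stdio.h>

int main(void) {
    long P = 384, Q = 768;
    long UNROLL_M = 16, UNROLL_N = 4;
    printf("packed A panel: %ld KiB\n", P * Q * 2 / 1024);  /* 576 KiB */
    printf("micro-tile: %ld x %ld\n", UNROLL_M, UNROLL_N);
    return 0;
}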

#ifdef ARCH_X86

#define SGEMM_DEFAULT_UNROLL_M 4

@@ -2454,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
#else
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8