Merge branch 'xianyi:develop' into azure-mingw-make

Martin Kroeker 2021-10-06 18:23:36 +02:00 committed by GitHub
commit f54fa15cdd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
86 changed files with 2772 additions and 307 deletions

View File

@ -1,33 +1,38 @@
# XXX: Precise is already deprecated, new default is Trusty.
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
dist: precise
dist: focal
sudo: true
language: c
matrix:
include:
- &test-ubuntu
os: linux
# os: linux
compiler: gcc
addons:
apt:
packages:
- gfortran
# before_script: &common-before
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
# script:
# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# - make -C test $COMMON_FLAGS $BTYPE
# - make -C ctest $COMMON_FLAGS $BTYPE
# - make -C utest $COMMON_FLAGS $BTYPE
# env:
# - TARGET_BOX=LINUX64
# - BTYPE="BINARY=64"
#
# - <<: *test-ubuntu
os: linux-ppc64le
before_script: &common-before
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
script:
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 17.dev)
set(OpenBLAS_PATCH_VERSION 18.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@ -132,7 +132,7 @@ endif ()
if (BUILD_BFLOAT16)
message(STATUS "Building Half Precision")
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
# list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
endif ()
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")

View File

@ -1,4 +1,47 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.18
02-Oct-2021
general:
- when the build-time number of preconfigured threads is exceeded
at runtime (typically by an external program calling BLAS functions
from a larger number of threads in parallel), OpenBLAS will now
allocate an auxiliary control structure for up to 512 additional
threads instead of aborting
- added support for Loongson's LoongArch64 cpu architecture
- fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON
- added support for building OpenBLAS as a CMAKE subproject
- added support for building for Windows/ARM64 targets with clang
- improved support for building with the IBM xlf compiler
- imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV)
- imported Reference-LAPACK PR 597 for testsuite compatibility with
LLVM's libomp
x86_64:
- added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000)
- added optimized SBGEMM for Intel Cooper Lake
- reinstated the performance patch for AVX512 SGEMV_T with a proper fix
- added a workaround for a gcc11 tree-vectorizer bug that caused spurious
failures in the test programs for complex BLAS3 when compiling at -O3
(the default for cmake "release" builds)
- added support for runtime cpu count detection under Haiku OS
- worked around a long-standing miscompilation issue of the Haswell DGEMV_T
kernel with gcc that could produce NaN output in some corner cases
POWER:
- improved performance of DASUM on POWER10
ARMV8:
- fixed crashes (use of reserved register x18) on Apple M1 under OSX
- fixed building with gcc releases earlier than 5.1
MIPS:
- fixed building under BSD
MIPS64:
- fixed building under BSD
====================================================================
Version 0.3.17
15-Jul-2021
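
The "auxiliary control structure" item in the 0.3.18 notes above is easiest to picture as a two-level slot table: the table sized at build time by NUM_THREADS, plus an overflow table for up to 512 extra callers that is only allocated once the first table runs out, instead of aborting. The sketch below is illustrative only — names such as `get_thread_slot` and the lock-free logic are not the OpenBLAS implementation, which lives in the memory/threading layer under driver/others/:

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical sketch of the "auxiliary control structure" idea:
 * a fixed table sized at build time, plus a lazily allocated
 * overflow table for up to 512 extra callers instead of aborting.
 * The real code protects all of this with a lock; omitted here.   */
#define MAX_PRECONFIGURED 4     /* stands in for the build-time NUM_THREADS */
#define MAX_AUXILIARY     512   /* the "up to 512 additional threads" above */

typedef struct { int in_use; int owner_id; } control_slot;

static control_slot preconfigured[MAX_PRECONFIGURED];
static control_slot *auxiliary;          /* allocated on first overflow */

static control_slot *get_thread_slot(int caller_id) {
    for (int i = 0; i < MAX_PRECONFIGURED; i++)
        if (!preconfigured[i].in_use) {
            preconfigured[i].in_use = 1;
            preconfigured[i].owner_id = caller_id;
            return &preconfigured[i];
        }
    /* Old behaviour: give up here.  New behaviour: fall back to an
     * auxiliary table with room for MAX_AUXILIARY more callers.     */
    if (!auxiliary)
        auxiliary = calloc(MAX_AUXILIARY, sizeof(control_slot));
    if (auxiliary)
        for (int i = 0; i < MAX_AUXILIARY; i++)
            if (!auxiliary[i].in_use) {
                auxiliary[i].in_use = 1;
                auxiliary[i].owner_id = caller_id;
                return &auxiliary[i];
            }
    return NULL;   /* only now is the request really unserviceable */
}

int main(void) {
    for (int id = 0; id < 8; id++)
        printf("caller %d -> slot %p\n", id, (void *)get_thread_slot(id));
    free(auxiliary);
    return 0;
}
```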

View File

@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild
lapack_prebuild :
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc

View File

@ -12,9 +12,13 @@ endif
ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif
endif
endif
ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)
@ -33,7 +37,11 @@ else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
endif
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
ifeq ($(OSNAME), AIX)
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif
else
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.17.dev
VERSION = 0.3.18.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -16,6 +16,8 @@ else
HOSTARCH = $(ARCH)
endif
HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null)
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
@ -33,6 +35,10 @@ else ifeq ($(ARCH), armv7)
override ARCH=arm
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
else ifeq ($(ARCH), mipsel)
override ARCH=mips
else ifeq ($(ARCH), mips64el)
override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
@ -303,7 +309,7 @@ else
SMP = 1
endif
else
ifeq ($(NUM_THREAD), 1)
ifeq ($(NUM_THREADS), 1)
SMP =
else
SMP = 1

View File

@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
- **Cortex-A55**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 ( different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
#### RISC-V
- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
```sh
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
```
(also known to work on C906)
### Support for multiple targets in a single library

View File

@ -19,7 +19,7 @@ jobs:
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
vmImage: 'ubuntu-latest'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
@ -35,7 +35,7 @@ jobs:
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
vmImage: 'ubuntu-latest'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
@ -213,8 +213,9 @@ jobs:
vmImage: 'ubuntu-latest'
steps:
- script: |
wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \
|| exit 1
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
alpine make DYNAMIC_ARCH=1 BINARY=64

View File

@ -104,7 +104,7 @@ endif ()
if (${F_COMPILER} STREQUAL "IBM")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM")
# FCOMMON_OPT += -qarch=440
set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur")
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -q64")
if (INTERFACE64)

View File

@ -134,6 +134,8 @@ if (BUILD_BFLOAT16)
set(SHSWAPKERNEL ../arm/swap.c)
set(TOBF16KERNEL ../x86_64/tobf16.c)
set(BF16TOKERNEL ../x86_64/bf16to.c)
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
endif ()
endmacro ()

View File

@ -469,6 +469,9 @@ endif()
if (BUILD_COMPLEX16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
endif()
if (BUILD_BFLOAT16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16")
endif()
if(NOT MSVC)
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
endif()

View File

@ -26,10 +26,12 @@
*****************************************************************************/
#include <string.h>
#ifdef OS_DARWIN
#ifdef __APPLE__
#include <sys/sysctl.h>
int32_t value;
size_t length=sizeof(value);
int64_t value64;
size_t length64=sizeof(value64);
#endif
#define CPU_UNKNOWN 0
@ -212,9 +214,9 @@ int detect(void)
}
#else
#ifdef DARWIN
#ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
if (value ==131287967) return CPU_VORTEX;
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
#endif
return CPU_ARMV8;
#endif
@ -265,7 +267,7 @@ int n=0;
printf("#define NUM_CORES %d\n",n);
#endif
#ifdef DARWIN
#ifdef __APPLE__
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
printf("#define NUM_CORES %d\n",value);
#endif
@ -420,17 +422,19 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#ifdef DARWIN
#ifdef __APPLE__
case CPU_VORTEX:
printf("#define VORTEX \n");
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0);
printf("#define L1_CODE_SIZE %d \n",value);
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0);
printf("#define L1_CODE_LINESIZE %d \n",value);
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
printf("#define L1_DATA_SIZE %d \n",value);
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
printf("#define L2_SIZE %d \n",value);
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %lld \n",value64);
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %lld \n",value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#endif
}
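
The cpuid_arm64.c hunk above widens the Apple `sysctlbyname` buffers to 64 bits, because the cache-size sysctls report 64-bit values and a 32-bit buffer can come back truncated or fail outright. A minimal standalone query using the same sysctl names as the diff (macOS only):

```c
/* Minimal macOS-only sketch of the sysctlbyname() pattern used above:
 * the cache-size sysctls report 64-bit values, so a 32-bit buffer
 * (as in the old code) can truncate or fail on Apple Silicon.       */
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int main(void) {
    int64_t value64 = 0;
    size_t  length64 = sizeof(value64);

    if (sysctlbyname("hw.l1dcachesize", &value64, &length64, NULL, 0) == 0)
        printf("#define L1_DATA_SIZE %lld\n", (long long)value64);

    length64 = sizeof(value64);
    if (sysctlbyname("hw.l2cachesize", &value64, &length64, NULL, 0) == 0)
        printf("#define L2_SIZE %lld\n", (long long)value64);

    length64 = sizeof(value64);
    if (sysctlbyname("hw.cachelinesize", &value64, &length64, NULL, 0) == 0)
        printf("#define L1_CODE_LINESIZE %lld\n", (long long)value64);

    return 0;
}
```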

View File

@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type})
endif ()
# special defines for complex
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
foreach (u_source ${U_SOURCES})
@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES})
endif ()
endforeach ()
if (BUILD_BFLOAT16)
if (USE_THREAD)
GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16")
GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16")
endif ()
endif ()
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)
if (USE_THREAD)
GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE")

View File

@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0)
endif ()
if (BUILD_BFLOAT16)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
endif ()
endif ()
endforeach ()
if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)

View File

@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8;
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
extern gotoblas_t gotoblas_POWER9;
#endif
//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
//#define HAVE_P10_SUPPORT 1
//#endif
#ifdef HAVE_P10_SUPPORT
extern gotoblas_t gotoblas_POWER10;
#endif

View File

@ -2695,7 +2695,7 @@ static volatile struct {
} memory[NUM_BUFFERS];
static volatile struct newmemstruct
struct newmemstruct
{
BLASULONG lock;
void *addr;
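
The memory.c change above drops `static volatile` from the `newmemstruct` definition: those qualifiers belong on object declarations, not on a bare struct definition, where compilers typically warn that they are ignored. A tiny sketch of the distinction, with illustrative names only:

```c
/* A struct definition introduces a type; static/volatile apply to objects.
 * "static volatile struct s { ... };" with no declarator declares no object,
 * and compilers typically warn that the qualifiers are ignored — which is
 * what the diff cleans up.                                                  */
struct newmem_example {        /* illustrative name, not the OpenBLAS struct */
    unsigned long lock;
    void *addr;
};

/* The qualifiers go on the variables that use the type: */
static volatile struct newmem_example newmem_table[4];

int main(void) {
    newmem_table[0].lock = 1;   /* volatile access through the object */
    return (int)newmem_table[0].lock - 1;
}
```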

View File

@ -524,6 +524,9 @@ void blas_set_parameter(void){
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif
#ifdef BUILD_BFLOAT16
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
@ -629,7 +632,9 @@ void blas_set_parameter(void){
xgemm_p = 16 * (size + 1);
#endif
#ifdef BUILD_BFLOAT16
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
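
The added `sbgemm_r` line follows the same partitioning formula as the neighbouring `sgemm_r`/`dgemm_r` lines: subtract one aligned P×Q panel from the buffer, divide what is left by the size of one Q-element row, and reduce the result to a multiple of 16 with `(x - 15) & ~15`. A worked sketch with made-up constants (the real values come from the target's param.h):

```c
/* Worked sketch of the blocking formula used for sbgemm_r/sgemm_r above:
 * whatever is left of the buffer after one aligned P*Q panel is divided
 * into rows of Q elements, and the row count is reduced to a multiple of
 * 16 via ((x - 15) & ~15), which never exceeds the available count.
 * All constants below are made up for the illustration.                 */
#include <stdio.h>

int main(void) {
    long buffer_size = 32 << 20;       /* pretend BUFFER_SIZE = 32 MiB   */
    long p = 256, q = 256;             /* pretend GEMM_P / GEMM_Q        */
    long elem = 4;                     /* element size used in the lines above */
    long offset_a = 0, align = 0x3fff; /* pretend GEMM_OFFSET_A / GEMM_ALIGN   */

    long panel = (p * q * elem + offset_a + align) & ~align;
    long r = (((buffer_size - panel) / (q * elem)) - 15) & ~15;

    printf("r = %ld (multiple of 16: %s)\n", r, (r % 16 == 0) ? "yes" : "no");
    return 0;
}
```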

View File

@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#endif
#ifdef FORCE_HASWELL
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
@ -350,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX512
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG "-DSKYLAKEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -380,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_COOPERLAKE
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX512
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "COOPERLAKE"
#define ARCHCONFIG "-DCOOPERLAKE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG "-DZEN " \

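The getarch.c additions above give every forced Intel target a graceful downgrade path when `NO_AVX512`, `NO_AVX2` or `NO_AVX` is set: Cooper Lake and Skylake-X fall back to Haswell, Haswell to Sandy Bridge, and Sandy Bridge to Nehalem. A condensed sketch of the same preprocessor cascade, keeping only the `CORENAME` selection (the ARCHCONFIG strings are omitted):

```c
/* Condensed sketch of the fallback cascade added to getarch.c:
 * each NO_* option peels off one ISA level of the forced target.
 * e.g. compiling with -DNO_AVX512 -DNO_AVX2 selects SANDYBRIDGE.  */
#include <stdio.h>

#if defined(NO_AVX512)
#  if defined(NO_AVX2)
#    if defined(NO_AVX)
#      define CORENAME "NEHALEM"
#    else
#      define CORENAME "SANDYBRIDGE"
#    endif
#  else
#    define CORENAME "HASWELL"
#  endif
#else
#  define CORENAME "SKYLAKEX"   /* or COOPERLAKE, when that target is forced */
#endif

int main(void) {
    printf("#define CORENAME \"%s\"\n", CORENAME);
    return 0;
}
```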
View File

@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS})
GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX})
GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true)
#sdsdot, dsdot
if (BUILD_SINGLE OR BUILD_DOUBLE)
GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE")
@ -104,6 +105,15 @@ endif ()
GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG})
GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG})
if (BUILD_BFLOAT16)
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16")
endif ()
# complex-specific sources
foreach (float_type ${FLOAT_TYPES})
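
The new interface objects above (`sbstobf16`, `sbdtobf16`, `sbf16tos`, `dbf16tod`, `sbdot`, `sbgemm`, `sbgemv`) all operate on the bfloat16 storage format, i.e. a float truncated to its upper 16 bits. A standalone sketch of that conversion (plain truncation for brevity; the tobf16/bf16to kernels referenced by the build are the authority and also handle strides and rounding):

```c
/* bfloat16 is the upper 16 bits of an IEEE-754 float: same sign and
 * exponent, 7 mantissa bits.  This sketch converts by truncation only. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint16_t bf16_t;

static bf16_t float_to_bf16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    return (bf16_t)(bits >> 16);          /* keep sign + exponent + 7 mantissa bits */
}

static float bf16_to_float(bf16_t h) {
    uint32_t bits = (uint32_t)h << 16;    /* dropped mantissa bits become zero */
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

int main(void) {
    float x = 3.14159265f;
    bf16_t h = float_to_bf16(x);
    printf("%f -> 0x%04x -> %f\n", x, (unsigned)h, bf16_to_float(h));
    return 0;
}
```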

View File

@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
PRINT_DEBUG_CNAME;
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#ifdef DYNAMIC_ARCH
if (support_avx512() )
#endif

View File

@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
FLOAT * ALPHA = &alpha;
FLOAT alpha_r = ALPHA[0];
@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

View File

@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE")
# sbdot
if (BUILD_BFLOAT16)
GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16")
endif()
if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE")
@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SB")
endif ()
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})
@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
endif ()
if (BUILD_BFLOAT16)
GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16")
endif ()
# Makefile.L3
set(USE_TRMM false)
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
endif()
foreach (float_type SINGLE DOUBLE BFLOAT16)
foreach (float_type SINGLE DOUBLE)
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
if (NOT ${BUILD_BFLOAT16})
continue ()
else ()
set (float_char "SB")
endif ()
endif ()
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
endforeach()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE")
endif ()
if (BUILD_BFLOAT16)
if (SBGEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16")
endif ()
if (SBGEMMITCOPY)
GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16")
endif ()
if (SBGEMMONCOPY)
GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16")
endif ()
if (SBGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16")
endif ()
GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
endif ()
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SB")
endif ()
if (${float_char}GEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
endif ()
@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
endif ()
if (BUILD_BFLOAT16)
if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_NN)
set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_NT)
set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_TN)
set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_TT)
set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_B0_NN)
set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_B0_NT)
set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_B0_TN)
set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
endif ()
endif ()
if (NOT DEFINED ${float_char}OMATCOPY_CN)
@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
#geadd
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
endforeach ()
if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE")
@ -840,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE")
if (SGEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE")
endif ()
if (SGEMMITCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE")
endif ()
if (SGEMMONCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE")
endif ()
if (SGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE")
if (SGEMMITCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE")
endif ()
if (SGEMMONCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE")
endif ()
if (SGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE")
endif ()
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
endif ()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE")
GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE")
endif ()

View File

@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B03 x16
#define B04 x17
#define I x18
#define J x19
#define I x19
#define J x20
#define TEMP1 x20
#define TEMP2 x21
#define TEMP1 x21
#define A_PREFETCH 2560
#define B_PREFETCH 256

View File

@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
//#define temp x18
#define tempOffset x19
#define tempK x20
#define temp x21
#define alpha0 d10
#define alphaV0 v10.d[0]

View File

@ -30,7 +30,7 @@ All rights reserved.
#define B00 x22
#define I x18
#define I x21
#define J x19
#define TEMP1 x20

View File

@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alpha w17
#define temp x18
//#define temp x18
#define tempOffset x19
#define tempK x20
#define temp x21
#define alpha0 s10
#define alphaV0 v10.s[0]

View File

@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow2 x14
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alphaR x19
#define alphaI x20
#define alpha0_R d10
#define alphaV0_R v10.d[0]

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alphaI x22
#define temp x19
#define tempOffset x20
#define tempK x21

View File

@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -4;
#if V_SIMD && !defined(DSDOT)
const int vstep = v_nlanes_f32;
const int unrollx4 = n & (-vstep * 4);
@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
dot = v_sum_f32(vsum0);
#elif defined(DSDOT)
int n1 = n & -4;
for (; i < n1; i += 4)
{
dot += (double) y[i] * (double) x[i]
@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+ (double) y[i+3] * (double) x[i+3] ;
}
#else
int n1 = n & -4;
for (; i < n1; i += 4)
{
dot += y[i] * x[i]
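
The sdot.c change above only moves `int n1 = n & -4;` into the branches that use it, since the SIMD path computes its own block size. The `n & -4` idiom itself clears the two low bits, i.e. rounds n down to a multiple of 4 so the main loop can be unrolled by four with a scalar tail for the remainder:

```c
/* The n & -4 idiom used in sdot.c: -4 is ...11111100 in two's complement,
 * so the AND clears the low two bits and rounds n down to a multiple of 4. */
#include <stdio.h>

int main(void) {
    for (int n = 0; n <= 9; n++)
        printf("n=%d  n&-4=%d\n", n, n & -4);

    /* typical use: unrolled main loop plus scalar tail */
    float x[9] = {1,2,3,4,5,6,7,8,9}, y[9] = {1,1,1,1,1,1,1,1,1};
    int n = 9, n1 = n & -4, i = 0;
    double dot = 0.0;
    for (; i < n1; i += 4)
        dot += (double)y[i]*x[i] + (double)y[i+1]*x[i+1]
             + (double)y[i+2]*x[i+2] + (double)y[i+3]*x[i+3];
    for (; i < n; i++)
        dot += (double)y[i]*x[i];
    printf("dot = %.1f\n", dot);
    return 0;
}
```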

View File

@ -1,7 +1,6 @@
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
ifeq ($(HAVE_GAS), 1)
include $(KERNELDIR)/KERNEL.POWER8
else
#SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c
@ -44,6 +43,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_power10.S
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
@ -218,5 +218,4 @@ QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
endif

View File

@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
#endif
const float *mvecp = mvec;
/* We have to load reverse mask for big endian. */
/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
long ytmp;
__asm__
@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
#endif
"addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
#endif
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
:
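
The caxpy_microk_power10.c hunk above picks a different permute mask and a different store order when compiled for big-endian POWER. The compile-time switch it relies on is the `__BYTE_ORDER__` / `__ORDER_BIG_ENDIAN__` macros predefined by gcc and clang; a minimal portable sketch of that pattern, with the two mask layouts copied verbatim from the hunk (the vector intrinsics and inline asm are deliberately left out):

```c
/* The endianness guard used throughout these POWER10 kernels:
 * __BYTE_ORDER__ and __ORDER_BIG_ENDIAN__ are predefined by gcc/clang,
 * so the mask that exchanges real and imaginary parts is chosen at
 * compile time.  Mask bytes are copied from the hunk above.            */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const unsigned char swap_mask[16] = { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
#else
static const unsigned char swap_mask[16] = { 11,10,9,8, 15,14,13,12, 3,2,1,0, 7,6,5,4 };
#endif

int main(void) {
    /* runtime cross-check of the compile-time choice */
    uint32_t probe = 1;
    unsigned char first;
    memcpy(&first, &probe, 1);
    printf("little-endian at runtime: %s\n", first ? "yes" : "no");
    printf("first mask byte selected at compile time: %d\n", (int)swap_mask[0]);
    return 0;
}
```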

View File

@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
".align 5 \n"
"one%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
#endif
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"bgt one%= \n"
"two%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#include "common.h"
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8
@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
BLASLONG n1 = n & -16;
#else
BLASLONG n1 = n & -8;

View File

@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"
@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
"xxswapd 33, 34 \n\t"
"xvaddsp 35, 35, 32 \n\t"
"xvaddsp 34, 34, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xxpermdi 34, 35, 34, 0 \n\t"
#else
"xxpermdi 34, 34, 35, 2 \n\t"
#endif
"stxv 34, 0(%6) \n\t"
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"

View File

@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cgemm_macros_power10.S"
#if (_AIX)
.set perm_const1, 0x0405060700010203
.set perm_const2, 0x0c0d0e0f08090a0b
.set save_permute_12, 0x1011121300010203
.set save_permute_11, 0x18191a1b08090a0b
#else
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#endif
#ifndef NEEDPARAM
@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
#if (_AIX)
lis T2, (perm_const2>>48 & 0xFFFF)
lis T1, (perm_const1>>48 & 0xFFFF)
lis T3, (save_permute_12>>48 & 0xFFFF)
lis T4, (save_permute_11>>48 & 0xFFFF)
ori T2, T2, (perm_const2>>32 & 0xFFFF)
ori T1, T1, (perm_const1>>32 & 0xFFFF)
ori T3, T3, (save_permute_12>>32 & 0xFFFF)
ori T4, T4, (save_permute_11>>32 & 0xFFFF)
#else
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
#endif
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
#if (_AIX)
oris T2, T2, (perm_const2>>16 & 0xFFFF)
oris T1, T1, (perm_const1>>16 & 0xFFFF)
oris T3, T3, (save_permute_12>>16 & 0xFFFF)
oris T4, T4, (save_permute_11>>16 & 0xFFFF)
ori T2, T2, (perm_const2 & 0xFFFF)
ori T1, T1, (perm_const1 & 0xFFFF)
ori T3, T3, (save_permute_12 & 0xFFFF)
ori T4, T4, (save_permute_11 & 0xFFFF)
#else
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
#endif
li r0,0
li PRE,512
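
The `_AIX` branches above exist because the AIX assembler does not accept the `@highest`/`@higher`/`@h`/`@l` relocation operators, so the 64-bit permute constants are rebuilt from explicit 16-bit pieces with `lis`/`ori`/`rldicr`/`oris`/`ori`. The arithmetic being encoded is just this split, shown here in C for clarity (register allocation and scheduling aside):

```c
/* What the AIX branch of the asm computes: a 64-bit immediate built from
 * four 16-bit pieces, mirroring lis/ori for the upper 32 bits, a 32-bit
 * shift (rldicr ...,32,31), then oris/ori for the lower 32 bits.          */
#include <stdio.h>
#include <stdint.h>

static uint64_t build64(uint64_t k) {
    uint64_t r;
    r  = (k >> 48) & 0xFFFF;                 /* lis:  highest 16 bits          */
    r  = (r << 16) | ((k >> 32) & 0xFFFF);   /* ori:  next 16 bits             */
    r <<= 32;                                /* rldicr ...,32,31: move to top  */
    r |= ((k >> 16) & 0xFFFF) << 16;         /* oris: bits 16..31              */
    r |= k & 0xFFFF;                         /* ori:  bits 0..15               */
    return r;
}

int main(void) {
    uint64_t perm_const1 = 0x0405060700010203ULL;   /* value from the diff */
    printf("rebuilt: %016llx  ok=%d\n",
           (unsigned long long)build64(perm_const1),
           build64(perm_const1) == perm_const1);
    return 0;
}
```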

View File

@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
#endif
.endm
.macro LOAD4x8_2
@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 42, 38
xvf32gerpp 2, 43, 38
xvf32gerpp 1, 40, 38
xvf32gerpp 0, 41, 38
xvf32gerpp 7, 42, 39
xvf32gerpp 6, 43, 39
xvf32gerpp 5, 40, 39
xvf32gerpp 4, 41, 39
#else
xvf32gerpp 3, 42, 39
xvf32gerpp 2, 43, 39
xvf32gerpp 1, 40, 39
@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 43, 38
xvf32gerpp 5, 40, 38
xvf32gerpp 4, 41, 38
#endif
.if \Complete==0
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
MULT_APLHA_PART1 vs48, vs56, vs0, vs1
@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs32, vs32, vs3
xvaddsp vs33, vs33, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs40, vs40, vs7
xvaddsp vs41, vs41, vs5
xvaddsp vs34, vs34, vs11
xvaddsp vs35, vs35, vs9
xvaddsp vs42, vs42, vs15
xvaddsp vs43, vs43, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs33, vs0, vs8, 1
xxpermdi vs32, vs2, vs10, 1
xxpermdi vs41, vs4, vs12, 1
xxpermdi vs40, vs6, vs14, 1
xxpermdi vs35, vs8, vs0, 1
xxpermdi vs34, vs10, vs2, 1
xxpermdi vs43, vs12, vs4, 1
xxpermdi vs42, vs14, vs6, 1
#else
xxpermdi vs33, vs8, vs0, 2
xxpermdi vs32, vs10, vs2, 2
@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs34, vs2, vs10, 2
xxpermdi vs43, vs4, vs12, 2
xxpermdi vs42, vs6, vs14, 2
#endif
#endif
stxvp vs32, 0(T2)
stxvp vs40, 32(T2)
@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
#endif
.endm
.macro LOAD4x4_2
@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 39
xvf32gerpp 2, 37, 39
xvf32gerpp 1, 36, 38
xvf32gerpp 0, 37, 38
#else
xvf32gerpp 3, 36, 38
xvf32gerpp 2, 37, 38
xvf32gerpp 1, 36, 39
xvf32gerpp 0, 37, 39
#endif
.if \Complete==0
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvaddsp vs29, vs29, vs5
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
xxpermdi vs29, vs4, vs12, 1
xxpermdi vs28, vs6, vs14, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs14, vs6, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 32
xvf32gerpp 0, 35, 32
#endif
.endm
.macro LOAD4x2_2
@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 33
xvf32gerpp 0, 35, 33
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 37, 33
xvf32gerpp 0, 36, 33
#else
xvf32gerpp 1, 36, 32
xvf32gerpp 0, 37, 32
#endif
.if \Complete==0
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs2, vs10, 0
xxpermdi vs3, vs8, vs0, 3
xxpermdi vs11, vs10, vs2, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs10, vs2, 0
xxpermdi vs3, vs0, vs8, 3
xxpermdi vs11, vs2, vs10, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
xvaddsp vs25, vs25, vs3
xvaddsp vs27, vs27, vs11
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs2, vs10, 0
xxpermdi vs25, vs8, vs0, 3
xxpermdi vs27, vs10, vs2, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs10, vs2, 0
xxpermdi vs25, vs0, vs8, 3
xxpermdi vs27, vs2, vs10, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs25, 0(T1)
@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.endm
.macro LOAD4x1_2
@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD4x1_2O OffsetA, OffsetB
lxv vs32, (\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO)
.endm
@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 36, 33
xvf32gerpp 1, 37, 33
#else
xvf32gerpp 0, 37, 33
xvf32gerpp 1, 36, 33
#endif
.if \Complete==0
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
.endif
.if \IsLast==1
.if \Complete==1
@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 37, 34
xvf32gerpp 3, 36, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 41, 35
xvf32gerpp 3, 40, 35
xvf32gerpp 0, 39, 35
xvf32gerpp 1, 38, 35
#else
xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34
#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
@ -1068,22 +1262,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 32(CO)
@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 37, 35
xvf32gerpp 1, 36, 35
#else
xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34
#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs8, vs0, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs8, vs0, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs26, 0(T1)
@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
.endm
@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 3, 35, 40
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs26, 32(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
stxv vs6, 32(CO)
stxv vs4, 48(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
stxv vs4, 32(CO)
stxv vs6, 48(CO)
#endif
#endif
addi CO, CO, 64
.endm
@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, (32+\OffsetA)(AO)
.endm
@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 1, 35, 36
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
#endif
#endif
addi CO, CO, 32
.endm
@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
#else
xxperm vs0, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs0
@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37

View File

@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"

View File

@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "cswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif

View File

@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dasum_microk_power10.c"
#elif defined(POWER10)
#include "dasum_microk_power8.c"
#include "dasum_microk_power10.c"
#endif
#endif
#ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32)
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
#else
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
#endif
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
#endif
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
#endif
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
#endif
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
#else
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
#endif
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
#else
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
#endif
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
#else
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
#endif
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
#else
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
#endif
"lxvpx 50, %7, %11 \n\t" // a4[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
#else
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
#endif
"lxvpx 52, %8, %11 \n\t" // a5[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
#else
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
#endif
"lxvpx 54, %9, %11 \n\t" // a6[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"
@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"
@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"stxvp 36, 0( %2) \n\t" // y0, y1
:

View File

@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvp 40, 32(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(42,34,35)
XXMRGLD_S(43,34,35)
XXMRGHD_S(44,4,5)
XXMRGLD_S(45,4,5)
#else
XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34)
XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4)
#endif
"xvadddp 42,42,43 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(46,6,7)
XXMRGLD_S(47,6,7)
#else
XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6)
#endif
"xvadddp 44,44,45 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(48,8,9)
XXMRGLD_S(49,8,9)
#else
XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8)
#endif
"xvadddp 46,46,47 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 38,42,36 \n\t"
"xvmaddadp 39,44,36 \n\t"
#else
"xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
#endif
"xvadddp 48,48,49 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 41,48,36 \n\t"
#else
"xvmaddadp 41,46,36 \n\t"
#endif
"stxvp 38, 0(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 40,46,36 \n\t"
#else
"xvmaddadp 40,48,36 \n\t"
#endif
"stxvp 40, 32(%[y]) \n\t"
: [memy] "+m" (*(double (*)[8])y),

View File

@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "drot_microk_power10.c"
#elif defined(POWER10)
#include "drot_microk_power8.c"
#include "drot_microk_power10.c"
#endif
#endif
@ -110,8 +108,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT temp;
if ( n <= 0 ) return(0);
@ -119,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
@ -139,7 +135,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
drot_kernel_16(n1, x1, y1, c, s);
drot_kernel_16(n1, x, y, c, s);
i=n1;
}
#endif

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dscal_microk_power10.c"
#elif defined(POWER10)
#include "dscal_microk_power8.c"
#include "dscal_microk_power10.c"
#endif
#endif
@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "dswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;

View File

@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
if (inc_x == 1) {
BLASLONG n1 = n & -32;
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)
BLASLONG n1 = n & -32;
if (n1 > 0) {
max = diamax_kernel_32(n1, x, &maxf);

View File

@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sasum_microk_power10.c"
#elif defined(POWER10)
#include "sasum_microk_power8.c"
#include "sasum_microk_power10.c"
#endif
#endif
@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

View File

@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "srot_microk_power10.c"
#elif defined(POWER10)
#include "srot_microk_power8.c"
#include "srot_microk_power10.c"
#endif
#endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sscal_microk_power10.c"
#elif defined(POWER10)
#include "sscal_microk_power8.c"
#include "sscal_microk_power10.c"
#endif
#endif
@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;

View File

@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "sswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif
@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;

View File

@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b,
vector FLOAT *Vc6 = (vector FLOAT *) c6;
vector FLOAT *Vc7 = (vector FLOAT *) c7;
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
int j;
b[120] = (c0[15] *= a[255]);
b[121] = (c1[15] *= a[255]);

View File

@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b,
vector FLOAT *Vc6 = (vector FLOAT *) c6;
vector FLOAT *Vc7 = (vector FLOAT *) c7;
vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;
int j;
b[0] = (c0[0] *= a[0]);
b[1] = (c1[0] *= a[0]);

View File

@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
double alpha_r, double alpha_i)
{
#if !defined(CONJ)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { -1.0, 1.0 };
#else
static const double mvec[2] = { 1.0, -1.0 };
#endif
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { 1.0, -1.0 };
#else
static const double mvec[2] = { -1.0, 1.0 };
#endif
#endif
const double *mvecp = mvec;
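/* The +/-1 constants supply the signs applied to the alpha_i cross terms of
   the complex axpy update; a minimal scalar sketch of the non-conjugated case:
       y_r += alpha_r*x_r - alpha_i*x_i;
       y_i += alpha_r*x_i + alpha_i*x_r;
   CONJ negates both entries of mvec, and the element order within the pair
   swaps with endianness, which yields the four variants above. */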

View File

@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r0, FLINK_SAVE(SP)
#if defined(linux) || defined(__FreeBSD__)
#if defined(linux) || defined(__FreeBSD__) || defined(_AIX)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif

View File

@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm
/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#endif
.endm
/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#endif
.endm
/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
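/* Scalar form being reconstructed (a sketch of the non-conjugated case):
       r = a_r*b_r - a_i*b_i
       i = a_r*b_i + a_i*b_r
   The two macros above gather the real*real/imag*imag and real*imag/imag*real
   partial products for two results at a time, and AGGREGATE_REALS_IMAGES
   applies the +/- pattern ("op") selected by the conjugation mode. */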
@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1,\VSIN1,\VSIN2
xxmrgld \VSOUT2,\VSIN1,\VSIN2
#else
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
#endif
.endm
@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd vs46,vs50,vs50
xxmrgld vs47,vs50,vs50
#else
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
xxmrghd vs39,vs47,vs46
#endif
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs44, vs50
xvf64gerpp 3, vs46, vs50
xvf64gerpp 4, vs40, vs51
xvf64gerpp 5, vs42, vs51
xvf64gerpp 6, vs44, vs51
xvf64gerpp 7, vs46, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x8 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs16, vs32, vs32
xxlor vs17, vs33, vs33
xxlor vs18, vs34, vs34
xxlor vs19, vs35, vs35
xxlor vs20, vs36, vs36
xxlor vs21, vs37, vs37
xxlor vs22, vs38, vs38
xxlor vs23, vs39, vs39
xxlor vs24, vs40, vs40
xxlor vs25, vs41, vs41
xxlor vs26, vs42, vs42
xxlor vs27, vs43, vs43
xxlor vs28, vs44, vs44
xxlor vs29, vs45, vs45
xxlor vs30, vs46, vs46
xxlor vs31, vs47, vs47
#else
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs40, vs51
xvf64gerpp 3, vs42, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x4 OffsetA, OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs40, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x2 OffsetA,OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
xvf64gerpp 2, vs44, vs49
xvf64gerpp 3, vs46, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 0, vs40, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32

View File

@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;

View File

@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;

View File

@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#elif HAVE_KERNEL_4x4_VEC
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));

View File

@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#elif defined(POWER10)
#if defined(DOUBLE)
#include "zscal_microk_power10.c"
#else
#include "cscal_microk_power10.c"
#endif
#elif defined(POWER10)
#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
#endif
#endif

View File

@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xsnegdp 33, %x10 \n\t" // -alpha_i
XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i
#else
XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i
#endif
"lxvp 40, 0(%2) \n\t"
"lxvp 42, 32(%2) \n\t"
@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvadddp 49, 49, 39 \n\t"
"xvadddp 50, 50, %x3 \n\t"
"xvadddp 51, 51, %x4 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%2) \n\t"
"stxv 49, 16(%2) \n\t"
"stxv 50, 32(%2) \n\t"
"stxv 51, 48(%2) \n\t"
#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
#endif
"xvadddp 34, 34, %x5 \n\t"
@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvadddp 36, 36, %x7 \n\t"
"xvadddp 37, 37, %x8 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 34, 64(%2) \n\t"
"stxv 35, 80(%2) \n\t"
"stxv 36, 96(%2) \n\t"
"stxv 37, 112(%2) \n\t"
#else
"stxv 35, 64(%2) \n\t"
"stxv 34, 80(%2) \n\t"
"stxv 37, 96(%2) \n\t"
"stxv 36, 112(%2) \n\t"
#endif
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t"
@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvadddp 50, 50, %x3 \n\t"
"xvadddp 51, 51, %x4 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%2) \n\t"
"stxv 49, 16(%2) \n\t"
"stxv 50, 32(%2) \n\t"
"stxv 51, 48(%2) \n\t"
#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
#endif
"xvadddp 34, 34, %x5 \n\t"
"xvadddp 35, 35, %x6 \n\t"
"xvadddp 36, 36, %x7 \n\t"
"xvadddp 37, 37, %x8 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 34, 64(%2) \n\t"
"stxv 35, 80(%2) \n\t"
"stxv 36, 96(%2) \n\t"
"stxv 37, 112(%2) \n\t"
#else
"stxv 35, 64(%2) \n\t"
"stxv 34, 80(%2) \n\t"
"stxv 37, 96(%2) \n\t"
"stxv 36, 112(%2) \n\t"
#endif
"#n=%1 x=%0=%2 alpha=(%9,%10) \n"
:
"+m" (*x),

View File

@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "zswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "zswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif

View File

@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c
SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c
SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c
SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c
SBGEMM_BETA = sgemm_beta_skylakex.c
SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c
SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c
SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c
SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c
SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]);
regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n]));
#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]);
regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n]));
#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \
@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \
reg = _mm256_loadu_si256(x + idx_n);
reg = _mm256_loadu_si256((__m256i *)(x + idx_n));
#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \
reg = _mm_loadu_si128(x + idx_n);
reg = _mm_loadu_si128((__m128i *)(x + idx_n));
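/* The casts are needed because the unaligned integer loads take pointer-to-vector
   arguments, i.e. _mm256_loadu_si256(__m256i const *) and
   _mm_loadu_si128(__m128i const *), so loading from a bfloat16 array looks like
       __m256i v = _mm256_loadu_si256((__m256i *)(x + idx_n));
 */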
#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \

View File

@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
if (n2 < 64) {
__m128 accum_10, accum_11, accum_12, accum_13;
__m128 abs_mask1;
__m128 abs_mask1 = abs_mask1; /* self-initialization quiets a 'may be used uninitialized' warning */
accum_10 = _mm_setzero_ps();
accum_11 = _mm_setzero_ps();

View File

@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
for (i = 0; i < tail_index_AVX2; i += 16) {
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask);
accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask);
}
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
}
accum_20 = accum_20 + accum_21 + accum_22 + accum_23;

View File

@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
__m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2);
accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
}
accum_20 = accum_20 + accum_21 + accum_22 + accum_23;

View File

@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
__m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
for (i = 0; i < tail_index_AVX2; i += 32) {
accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask);
accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask);
}
accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
}
accum_20 += accum_21;

View File

@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
__m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2);
accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
}
accum_20 += accum_21;

View File

@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
__m256 accum256_1 = _mm256_setzero_ps();
int tail_index_32 = n&(~31);
for (int j = 0; j < tail_index_32; j += 32) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16]));
}
accum256 = _mm256_add_ps(accum256, accum256_1);
/* Processing the remaining <32 chunk with 16-elements processing */
if ((n&16) != 0) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32]));
}
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16]));
}
/* Processing the remaining <8 chunk with masked 8-elements processing */
@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
} else if (n > 15) { /* n range from 16 to 31 */
/* Processing <32 chunk with 16-elements processing */
__m256 accum256 = _mm256_setzero_ps();
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0]));
accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16]));
}
/* Processing the remaining <8 chunk with masked 8-elements processing */
@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
}
} else if (n > 7) { /* n range from 8 to 15 */
/* Processing <16 chunk with 8-elements processing */
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0]));
/* Processing the remaining <8 chunk with masked 8-elements processing */
if ((n&7) != 0) {

View File

@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat
// K=Any number but will be processed based on 32, M<=16
void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3;
bfloat16 * src_addr0;
bfloat16 * dst_addr0, * dst_addr1;
BLASLONG tag_k_32x = k & (~31);

View File

@ -0,0 +1,499 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <immintrin.h>
#include "common.h"
#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr))
#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr))
#define BROADCAST64(base, step, n, offset, zmm) \
if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \
else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2))
#define DECLARE_A_PAIR(A) \
__m512i A_lo_##A; __m512i A_hi_##A;
#define LOAD_A_PAIR(A) \
VMOVLDUP(ptr_a##A, A_lo_##A); \
VMOVHDUP(ptr_a##A, A_hi_##A);
#define MASK_LOAD_A_PAIR(A) { \
__m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \
A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \
A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \
}
#define LOAD_A_PAIR_TAIL(A) { \
__m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \
__m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \
A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \
A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \
}
#define MASK_LOAD_A_PAIR_TAIL(A) { \
__m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \
__m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \
A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \
A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \
}
#define DECLARE_B_PAIR() \
__m512i B_lo; __m512i B_hi;
#define PREFETCH_B_STEP 32
#define PREFETCH_B(Bx, By) \
if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \
else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2))
#define BROADCAST_B_PAIR(Bx, By) \
BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \
BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi);
#define MASK_BROADCAST_B_PAIR(Bx, x) {\
__m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \
B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \
B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \
}
#define BROADCAST_B_PAIR_TAIL(Bx, By) {\
__m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \
xmm = _mm_cvtepu16_epi32(xmm); \
B_lo = _mm512_broadcast_i32x2(xmm); \
B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \
}
#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\
__m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \
xmm = _mm_cvtepu16_epi32(xmm); \
B_lo = _mm512_broadcast_i32x2(xmm); \
B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \
}
#define DECLARE_RESULT_4X(A, Bx, By) \
__m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \
__m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \
__m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \
__m512 result_11_##A##Bx##By = _mm512_setzero_ps();
#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b)
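/* _mm512_dpbf16_ps semantics, per 32-bit lane i of the accumulator r:
       r[i] += (float)a.bf16[2*i]   * (float)b.bf16[2*i]
             + (float)a.bf16[2*i+1] * (float)b.bf16[2*i+1]
   i.e. a dot product over adjacent bf16 pairs widened to fp32, so every FMA
   consumes two k iterations at once. */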
#define MATMUL_4X(A, Bx, By) \
FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \
FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \
FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \
FMA(A_hi_##A, B_hi, result_11_##A##Bx##By);
#define _STORE_C_2nx16(addr, val0, val1) \
asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \
asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \
asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc))
#define _MASK_STORE_C_2nx16(addr, val0, val1) \
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \
asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \
asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask))
#define _REORDER_C_2X(result_0, result_1) { \
__m512 tmp0, tmp1; \
tmp0 = _mm512_unpacklo_ps(result_0, result_1); \
tmp1 = _mm512_unpackhi_ps(result_0, result_1); \
result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \
result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \
}
#define _STORE_2X(ptr_c, result_0, result_1) {\
_REORDER_C_2X(result_0, result_1) \
_STORE_C_2nx16(ptr_c, result_0, result_1); \
ptr_c += ldc * 2; \
}
#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\
_REORDER_C_2X(result_0, result_1) \
_MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \
ptr_c += ldc * 2; \
}
#define STORE_4X(A, Bx, By) { \
_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \
_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \
}
#define MASK_STORE_4X(A, Bx, By) { \
_MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \
_MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \
}
#define _STORE_C_16(addr, val0) \
asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr));
#define _MASK_STORE_C_16(addr, val0) \
asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \
asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask));
#define N_STORE_4X(A, Bx, By) { \
_REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \
_REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \
switch(n_count) { \
case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \
case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \
case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \
} \
ptr_c##A += ldc * n_count; \
}
#define N_MASK_STORE_4X(A, Bx, By) { \
_REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \
_REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \
switch(n_count) { \
case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \
case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \
case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \
} \
ptr_c##A += ldc * n_count; \
}
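/*
 * Blocking (see the loops below): n is consumed in blocks of 24, 12 and 4 plus
 * a masked tail; m in panels of 16 (32 for the 12-column path) plus a masked
 * tail; k is unrolled over bf16 pairs, and an odd trailing k step is widened
 * with a zero partner by the *_TAIL load/broadcast macros.
 */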
int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
{
IFLOAT *ptr_a = A, *ptr_b = B;
IFLOAT *ptr_b0, *ptr_b1;
IFLOAT *ptr_a0, *ptr_a1;
FLOAT *ptr_c = C;
FLOAT *ptr_c0, *ptr_c1;
BLASLONG n_count = n;
BLASLONG m_count, k_count;
BLASLONG n_blksize = 4 * k;
BLASLONG cn_offset = 0;
__m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha));
for (; n_count > 23; n_count -= 24) {
IFLOAT *ptr_b00 = ptr_b;
IFLOAT *ptr_b10 = ptr_b + n_blksize * 3;
ptr_a0 = ptr_a;
ptr_c = C + cn_offset * ldc;
m_count = m;
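/* full panels: 16 rows of C per iteration, 24 columns kept in six groups of 4 */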
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
ptr_b1 = ptr_b10;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2);
k_count = k;
for (; k_count > 3; k_count -=4) {
LOAD_A_PAIR(0);
_mm_prefetch(ptr_a0 + 128, _MM_HINT_T0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
LOAD_A_PAIR(0);
_mm_prefetch(ptr_a0 + 128, _MM_HINT_T0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
}
for (; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
ptr_a0 += 16;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4;
}
ptr_c0 = ptr_c;
STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2);
STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2);
ptr_c += 16;
}
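/* m tail (fewer than 16 rows): masked loads from the packed A panel and masked stores to C */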
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
ptr_b1 = ptr_b10;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
ptr_a0 += m_count * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4 * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
ptr_a0 += m_count;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0);
BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1);
BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2);
ptr_b1 += 4;
}
ptr_c0 = ptr_c;
MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2);
MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2);
ptr_c += m_count;
}
ptr_b += 24 * k;
cn_offset += 24;
}
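/* 12-column blocks: 32 rows of C per iteration using two A panels, then 16 rows, then a masked tail */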
for (; n_count > 11; n_count -= 12) {
IFLOAT *ptr_b00 = ptr_b;
ptr_a0 = ptr_a;
ptr_a1 = ptr_a + 16 * k;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 31; m_count -= 32) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0); DECLARE_A_PAIR(1);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0); LOAD_A_PAIR(1);
ptr_a0 += 16 * 2;
ptr_a1 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2);
ptr_b0 += 4 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1);
ptr_a0 += 16;
ptr_a1 += 16;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2);
ptr_b0 += 4;
}
ptr_c0 = ptr_c;
ptr_c1 = ptr_c + 16;
STORE_4X(0, 0, 0); STORE_4X(1, 0, 0);
STORE_4X(0, 0, 1); STORE_4X(1, 0, 1);
STORE_4X(0, 0, 2); STORE_4X(1, 0, 2);
ptr_c += 16 * 2;
ptr_a0 = ptr_a1;
ptr_a1 = ptr_a0 + 16 * k;
}
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
ptr_a0 += 16 * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
ptr_a0 += 16;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
}
ptr_c0 = ptr_c;
STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
ptr_a0 += m_count * 2;
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4 * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
ptr_a0 += m_count;
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1);
BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2);
ptr_b0 += 4;
}
ptr_c0 = ptr_c;
MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2);
ptr_c += m_count;
}
ptr_b += 12 * k;
cn_offset += 12;
}
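/* 4-column blocks */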
for (; n_count > 3; n_count -= 4) {
IFLOAT *ptr_b00 = ptr_b;
ptr_a0 = ptr_a;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4 * 2;
ptr_a0 += 16 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4;
ptr_a0 += 16;
}
ptr_c0 = ptr_c;
STORE_4X(0, 0, 0);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4 * 2;
ptr_a0 += m_count * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += 4;
ptr_a0 += m_count;
}
ptr_c0 = ptr_c;
MASK_STORE_4X(0, 0, 0);
ptr_c += m_count;
}
ptr_b += 4 * k;
cn_offset += 4;
}
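/* n tail (fewer than 4 columns): masked broadcasts of B and column-count-dependent stores */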
if (n_count > 0) {
__mmask8 nmask = (1UL << n_count) - 1;
IFLOAT *ptr_b00 = ptr_b;
ptr_a0 = ptr_a;
ptr_c = C + cn_offset * ldc;
m_count = m;
for (; m_count > 15; m_count -= 16) {
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
LOAD_A_PAIR(0);
MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count * 2;
ptr_a0 += 16 * 2;
}
if (k_count > 0) {
LOAD_A_PAIR_TAIL(0);
MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count;
ptr_a0 += 16;
}
ptr_c0 = ptr_c;
N_STORE_4X(0, 0, 0);
ptr_c += 16;
}
if (m_count > 0) {
__mmask16 mmask = (1UL << m_count) - 1;
ptr_b0 = ptr_b00;
DECLARE_A_PAIR(0);
DECLARE_B_PAIR();
DECLARE_RESULT_4X(0, 0, 0);
for (k_count = k; k_count > 1; k_count -=2) {
MASK_LOAD_A_PAIR(0);
MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count * 2;
ptr_a0 += m_count * 2;
}
if (k_count > 0) {
MASK_LOAD_A_PAIR_TAIL(0);
MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0);
ptr_b0 += n_count;
ptr_a0 += m_count;
}
ptr_c0 = ptr_c;
N_MASK_STORE_4X(0, 0, 0);
ptr_c += m_count;
}
}
return 0;
}



@ -0,0 +1,353 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <immintrin.h>
#include "common.h"
#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \
asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8))
#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \
__m512i v; \
t0 = _mm512_unpacklo_epi32(r0, r1); \
t1 = _mm512_unpackhi_epi32(r0, r1); \
t2 = _mm512_unpacklo_epi32(r2, r3); \
t3 = _mm512_unpackhi_epi32(r2, r3); \
t4 = _mm512_unpacklo_epi32(r4, r5); \
t5 = _mm512_unpackhi_epi32(r4, r5); \
t6 = _mm512_unpacklo_epi32(r6, r7); \
t7 = _mm512_unpackhi_epi32(r6, r7); \
_MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \
r0 = _mm512_mask_blend_epi32(kc, t0, v); \
r1 = _mm512_mask_blend_epi32(k3, t2, v); \
_MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \
r2 = _mm512_mask_blend_epi32(kc, t1, v); \
r3 = _mm512_mask_blend_epi32(k3, t3, v); \
_MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \
r4 = _mm512_mask_blend_epi32(kc, t4, v); \
r5 = _mm512_mask_blend_epi32(k3, t6, v); \
_MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \
r6 = _mm512_mask_blend_epi32(kc, t5, v); \
r7 = _mm512_mask_blend_epi32(k3, t7, v); \
t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \
t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \
t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \
t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \
t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \
t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \
t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \
t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \
}
#define STORE_512_LO(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \
_mm512_storeu_si512(boffset0 + x*32, v);
#define STORE_512_HI(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \
_mm512_storeu_si512(boffset0 + (x + 8)*32, v);
#define MASK_STORE_512_LO(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \
_mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v);
#define MASK_STORE_512_HI(x) \
v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \
_mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v);
#define STORE_512(x, y) {\
__m512i v; \
if (x == 0) { STORE_512_LO(y); } \
else { STORE_512_HI(y); } \
}
#define MASK_STORE_512(x, y) {\
__m512i v; \
if (x == 0) { MASK_STORE_512_LO(y); } \
else { MASK_STORE_512_HI(y); } \
}
#define SET_TAIL(y, x) {\
if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \
else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \
}
#define GET_TAIL() \
switch (n_store + 1) { \
case 16: SET_TAIL(1, 7); break; \
case 15: SET_TAIL(1, 6); break; \
case 14: SET_TAIL(1, 5); break; \
case 13: SET_TAIL(1, 4); break; \
case 12: SET_TAIL(1, 3); break; \
case 11: SET_TAIL(1, 2); break; \
case 10: SET_TAIL(1, 1); break; \
case 9: SET_TAIL(1, 0); break; \
case 8: SET_TAIL(0, 7); break; \
case 7: SET_TAIL(0, 6); break; \
case 6: SET_TAIL(0, 5); break; \
case 5: SET_TAIL(0, 4); break; \
case 4: SET_TAIL(0, 3); break; \
case 3: SET_TAIL(0, 2); break; \
case 2: SET_TAIL(0, 1); break; \
case 1: SET_TAIL(0, 0); break; \
}
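/*
 * Packing routine for 16-wide panels: 16 lda-spaced source vectors are
 * transposed against the contiguous index in 32-element chunks with the
 * unpack/permute sequences above; the tail uses masked loads, the fall-through
 * switches store only complete element pairs, and GET_TAIL narrows a leftover
 * odd element from its 32-bit lanes back to bf16.
 */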
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;
IFLOAT *boffset0;
IFLOAT *aoffset;
IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07;
IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17;
aoffset = a;
boffset0 = b;
BLASLONG n16 = n & ~15;
BLASLONG m32 = m & ~31;
int permute_table[] = {
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
};
uint64_t permute_table2[] = {
0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3,
0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7,
};
__m512i idx_lo = _mm512_loadu_si512(permute_table);
__m512i idx_hi = _mm512_loadu_si512(permute_table + 16);
__m512i idx_lo2 = _mm512_loadu_si512(permute_table2);
__m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8);
__mmask16 kc = 0xcccc;
__mmask16 k3 = 0x3333;
__m512i r0, r1, r2, r3, r4, r5, r6, r7;
__m512i t00, t01, t02, t03, t04, t05, t06, t07;
__m512i t10, t11, t12, t13, t14, t15, t16, t17;
for (j = 0; j < n16; j += 16) {
aoffset00 = aoffset;
aoffset01 = aoffset00 + lda;
aoffset02 = aoffset01 + lda;
aoffset03 = aoffset02 + lda;
aoffset04 = aoffset03 + lda;
aoffset05 = aoffset04 + lda;
aoffset06 = aoffset05 + lda;
aoffset07 = aoffset06 + lda;
aoffset10 = aoffset07 + lda;
aoffset11 = aoffset10 + lda;
aoffset12 = aoffset11 + lda;
aoffset13 = aoffset12 + lda;
aoffset14 = aoffset13 + lda;
aoffset15 = aoffset14 + lda;
aoffset16 = aoffset15 + lda;
aoffset17 = aoffset16 + lda;
aoffset += 16 * lda;
for (i = 0; i < m32; i += 32) {
r0 = _mm512_loadu_si512(aoffset00 + i);
r1 = _mm512_loadu_si512(aoffset01 + i);
r2 = _mm512_loadu_si512(aoffset02 + i);
r3 = _mm512_loadu_si512(aoffset03 + i);
r4 = _mm512_loadu_si512(aoffset04 + i);
r5 = _mm512_loadu_si512(aoffset05 + i);
r6 = _mm512_loadu_si512(aoffset06 + i);
r7 = _mm512_loadu_si512(aoffset07 + i);
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
r0 = _mm512_loadu_si512(aoffset10 + i);
r1 = _mm512_loadu_si512(aoffset11 + i);
r2 = _mm512_loadu_si512(aoffset12 + i);
r3 = _mm512_loadu_si512(aoffset13 + i);
r4 = _mm512_loadu_si512(aoffset14 + i);
r5 = _mm512_loadu_si512(aoffset15 + i);
r6 = _mm512_loadu_si512(aoffset16 + i);
r7 = _mm512_loadu_si512(aoffset17 + i);
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3);
STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7);
STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3);
STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7);
boffset0 += 16 * 32;
}
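/* tail along the contiguous index: masked 16-bit loads zero the missing elements;
   only complete pairs are stored, the odd leftover (if any) is handled after the switch */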
if (i < m) {
int remain_m = m - i;
__mmask32 mmask = (1UL << remain_m) - 1;
r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i);
r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i);
r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i);
r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i);
r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i);
r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i);
r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i);
r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i);
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i);
r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i);
r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i);
r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i);
r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i);
r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i);
r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i);
r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i);
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
int n_store = remain_m/2;
switch (n_store) {
case 15: STORE_512(1, 6);
case 14: STORE_512(1, 5);
case 13: STORE_512(1, 4);
case 12: STORE_512(1, 3);
case 11: STORE_512(1, 2);
case 10: STORE_512(1, 1);
case 9: STORE_512(1, 0);
case 8: STORE_512(0, 7);
case 7: STORE_512(0, 6);
case 6: STORE_512(0, 5);
case 5: STORE_512(0, 4);
case 4: STORE_512(0, 3);
case 3: STORE_512(0, 2);
case 2: STORE_512(0, 1);
case 1: STORE_512(0, 0);
}
boffset0 += n_store * 32;
if (m & 0x1) {
__m512i tail;
GET_TAIL();
_mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail));
boffset0 += 16;
}
}
}
if (j < n) {
int remain_n = n - j;
__mmask16 nmask = (1UL << remain_n) - 1;
int load0, load1;
if (remain_n > 8) {
load0 = 8;
load1 = remain_n - 8;
} else {
load0 = remain_n;
load1 = 0;
}
aoffset00 = aoffset;
aoffset01 = aoffset00 + lda;
aoffset02 = aoffset01 + lda;
aoffset03 = aoffset02 + lda;
aoffset04 = aoffset03 + lda;
aoffset05 = aoffset04 + lda;
aoffset06 = aoffset05 + lda;
aoffset07 = aoffset06 + lda;
aoffset10 = aoffset07 + lda;
aoffset11 = aoffset10 + lda;
aoffset12 = aoffset11 + lda;
aoffset13 = aoffset12 + lda;
aoffset14 = aoffset13 + lda;
aoffset15 = aoffset14 + lda;
aoffset16 = aoffset15 + lda;
aoffset17 = aoffset16 + lda;
aoffset += 16 * lda;
for (i = 0; i < m32; i += 32) {
switch (load0) {
case 8: r7 = _mm512_loadu_si512(aoffset07 + i);
case 7: r6 = _mm512_loadu_si512(aoffset06 + i);
case 6: r5 = _mm512_loadu_si512(aoffset05 + i);
case 5: r4 = _mm512_loadu_si512(aoffset04 + i);
case 4: r3 = _mm512_loadu_si512(aoffset03 + i);
case 3: r2 = _mm512_loadu_si512(aoffset02 + i);
case 2: r1 = _mm512_loadu_si512(aoffset01 + i);
case 1: r0 = _mm512_loadu_si512(aoffset00 + i);
}
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
switch (load1) {
case 8: r7 = _mm512_loadu_si512(aoffset17 + i);
case 7: r6 = _mm512_loadu_si512(aoffset16 + i);
case 6: r5 = _mm512_loadu_si512(aoffset15 + i);
case 5: r4 = _mm512_loadu_si512(aoffset14 + i);
case 4: r3 = _mm512_loadu_si512(aoffset13 + i);
case 3: r2 = _mm512_loadu_si512(aoffset12 + i);
case 2: r1 = _mm512_loadu_si512(aoffset11 + i);
case 1: r0 = _mm512_loadu_si512(aoffset10 + i);
}
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3);
MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7);
MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3);
MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7);
boffset0 += remain_n * 32;
}
if (i < m) {
int remain_m = m - i;
__mmask32 mmask = (1UL << remain_m) - 1;
switch (load0) {
case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i);
case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i);
case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i);
case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i);
case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i);
case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i);
case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i);
case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i);
}
REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07);
switch (load1) {
case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i);
case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i);
case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i);
case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i);
case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i);
case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i);
case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i);
case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i);
}
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17);
int n_store = remain_m/2;
switch (n_store) {
case 15: MASK_STORE_512(1, 6);
case 14: MASK_STORE_512(1, 5);
case 13: MASK_STORE_512(1, 4);
case 12: MASK_STORE_512(1, 3);
case 11: MASK_STORE_512(1, 2);
case 10: MASK_STORE_512(1, 1);
case 9: MASK_STORE_512(1, 0);
case 8: MASK_STORE_512(0, 7);
case 7: MASK_STORE_512(0, 6);
case 6: MASK_STORE_512(0, 5);
case 5: MASK_STORE_512(0, 4);
case 4: MASK_STORE_512(0, 3);
case 3: MASK_STORE_512(0, 2);
case 2: MASK_STORE_512(0, 1);
case 1: MASK_STORE_512(0, 0);
}
boffset0 += n_store * remain_n * 2;
if (m & 0x1) {
__m512i tail;
GET_TAIL();
_mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail));
}
}
}
return 0;
}


@ -0,0 +1,208 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <immintrin.h>
#include "common.h"
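/*
 * Packing routine for 4-wide panels: four lda-spaced source vectors are
 * transposed in 32- and 8-element chunks along the contiguous index, with
 * masked loads for the tail; an odd trailing element is narrowed from its
 * 32-bit lanes back to bf16 (see GET_TAIL below).
 */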
#define REORDER_4x32(r0, r1, r2, r3) {\
__m512i t0, t1, t2, t3; \
t0 = _mm512_unpacklo_epi32(r0, r1); \
t1 = _mm512_unpackhi_epi32(r0, r1); \
t2 = _mm512_unpacklo_epi32(r2, r3); \
t3 = _mm512_unpackhi_epi32(r2, r3); \
r0 = _mm512_unpacklo_epi64(t0, t2); \
r1 = _mm512_unpackhi_epi64(t0, t2); \
r2 = _mm512_unpacklo_epi64(t1, t3); \
r3 = _mm512_unpackhi_epi64(t1, t3); \
t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \
t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \
t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \
t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \
r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \
r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \
r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \
r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \
}
#define REORDER_4x8(r0, r1, r2, r3) {\
__m128i t0, t1, t2, t3; \
t0 = _mm_unpacklo_epi32(r0, r1); \
t1 = _mm_unpackhi_epi32(r0, r1); \
t2 = _mm_unpacklo_epi32(r2, r3); \
t3 = _mm_unpackhi_epi32(r2, r3); \
r0 = _mm_unpacklo_epi64(t0, t2); \
r1 = _mm_unpackhi_epi64(t0, t2); \
r2 = _mm_unpacklo_epi64(t1, t3); \
r3 = _mm_unpackhi_epi64(t1, t3); \
}
#define GET_TAIL(tail, remain_m) \
switch((remain_m + 1)/2) { \
case 1: tail = r0; break; \
case 2: tail = r1; break; \
case 3: tail = r2; break; \
case 4: tail = r3; break; \
}
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;
IFLOAT *aoffset;
IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3;
IFLOAT *boffset;
aoffset = a;
boffset = b;
BLASLONG m32 = m & ~31;
BLASLONG m8 = m & ~7;
BLASLONG n4 = n & ~3;
int permute_table[] = {
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
};
__m512i idx_lo_128 = _mm512_loadu_si512(permute_table);
__m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16);
__m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32);
__m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48);
for (j = 0; j < n4; j += 4) {
aoffset0 = aoffset;
aoffset1 = aoffset0 + lda;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset += 4 * lda;
for (i = 0; i < m32; i += 32) {
__m512i r0, r1, r2, r3;
r0 = _mm512_loadu_si512(aoffset0 + i);
r1 = _mm512_loadu_si512(aoffset1 + i);
r2 = _mm512_loadu_si512(aoffset2 + i);
r3 = _mm512_loadu_si512(aoffset3 + i);
REORDER_4x32(r0, r1, r2, r3);
_mm512_storeu_si512(boffset + 32*0, r0);
_mm512_storeu_si512(boffset + 32*1, r1);
_mm512_storeu_si512(boffset + 32*2, r2);
_mm512_storeu_si512(boffset + 32*3, r3);
boffset += 32 * 4;
}
for (; i < m8; i += 8) {
__m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i));
__m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i));
__m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i));
__m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i));
REORDER_4x8(r0, r1, r2, r3);
_mm_storeu_si128((void *)(boffset + 8*0), r0);
_mm_storeu_si128((void *)(boffset + 8*1), r1);
_mm_storeu_si128((void *)(boffset + 8*2), r2);
_mm_storeu_si128((void *)(boffset + 8*3), r3);
boffset += 8 * 4;
}
if (i < m) {
int remain_m = m - i;
__mmask8 r_mask = (1UL << remain_m) - 1;
__m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i);
__m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i);
__m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i);
__m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i);
REORDER_4x8(r0, r1, r2, r3);
// the store must skip the odd tail row; it is handled separately below
int num_store = remain_m/2;
switch(num_store) {
case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2);
case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1);
case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0);
}
boffset += 8 * num_store;
if (m & 0x1) { // handle the odd trailing row
__m128i tail;
GET_TAIL(tail, remain_m);
/* the tail vector is zero-padded like:
* a, 0, b, 0, c, 0, d, 0
* so extract the low words of the data and store them
*/
tail = _mm_cvtepi32_epi16(tail);
_mm_store_sd((double *)boffset, (__m128d) tail); // only the lower 4 bfloat16 values are valid
boffset += 4;
}
}
}
if (j < n) {
int remain_n = n - j;
__mmask8 nmask = (1UL << remain_n) - 1;
aoffset0 = aoffset;
aoffset1 = aoffset0 + lda;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
__m128i r0, r1, r2, r3;
for (i = 0; i < m8; i += 8) {
switch (remain_n) {
case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i));
case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i));
case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i));
}
REORDER_4x8(r0, r1, r2, r3);
_mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0);
_mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1);
_mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2);
_mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3);
boffset += 8 * remain_n;
}
if (i < m) {
int remain_m = m - i;
__mmask8 mmask = (1UL << remain_m) - 1;
switch (remain_n) {
case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i);
case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i);
case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i);
}
REORDER_4x8(r0, r1, r2, r3);
int num_store = remain_m/2;
switch (num_store) {
case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2);
case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1);
case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0);
}
boffset += 2 * num_store * remain_n;
if (m & 0x1) {
__m128i tail;
GET_TAIL(tail, remain_m);
tail = _mm_cvtepi32_epi16(tail);
_mm_mask_storeu_epi16(boffset, nmask, tail);
}
}
}
return 0;
}


@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
return 1;
double MNK = (double) M * (double) N * (double) K;
if (MNK > 256.0*256.0*256.0) // disable for large matrices
return 0;
/* small matrix kernel works well for N = 8, 16, 32 */
if (N == 8 || N == 16 || N == 32)
return 1;
return 0;
}


@ -0,0 +1,164 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <immintrin.h>
#include "common.h"
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;
IFLOAT *boffset0, *boffset1;
boffset0 = b;
BLASLONG n32 = n & ~31;
BLASLONG m4 = m & ~3;
BLASLONG m2 = m & ~1;
uint32_t permute_table[] = {
0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
};
__m512i idx_lo = _mm512_loadu_si512(permute_table);
__m512i idx_hi = _mm512_loadu_si512(permute_table + 16);
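/* the epi16 unpacks below interleave within 128-bit lanes only, so idx_lo/idx_hi
   stitch the lanes back into sequential order across the full 512-bit row */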
for (j = 0; j < n32; j += 32) {
/* process two 16-wide n panels at the same time */
boffset1 = boffset0 + m * 16;
for (i = 0; i < m4; i += 4) {
/* the bf16 FMA needs a special memory layout:
* for a source layout like
* a00, a01, a02, a03, a04, a05 ....
* a10, a11, a12, a13, a14, a15 ....
* the data must be copied as
* a00, a10, a01, a11, a02, a12, a03, a13, ...
*/
__m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
__m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]);
__m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]);
__m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]);
__m512i a00 = _mm512_unpacklo_epi16(a0, a1);
__m512i a01 = _mm512_unpackhi_epi16(a0, a1);
__m512i a10 = _mm512_unpacklo_epi16(a2, a3);
__m512i a11 = _mm512_unpackhi_epi16(a2, a3);
a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11);
a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11);
_mm512_storeu_si512(boffset0, a0);
_mm512_storeu_si512(boffset1, a1);
_mm512_storeu_si512(boffset0 + 32, a2);
_mm512_storeu_si512(boffset1 + 32, a3);
boffset0 += 64;
boffset1 += 64;
}
for (; i < m2; i += 2) {
__m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
__m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]);
__m512i a00 = _mm512_unpacklo_epi16(a0, a1);
__m512i a01 = _mm512_unpackhi_epi16(a0, a1);
a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
_mm512_storeu_si512(boffset0, a0);
_mm512_storeu_si512(boffset1, a1);
boffset0 += 32;
boffset1 += 32;
}
for (; i < m; i++) {
/* just copy the single remaining row */
__m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]);
__m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]);
_mm256_storeu_si256((void *)boffset0, a0);
_mm256_storeu_si256((void *)boffset1, a1);
boffset0 += 16;
boffset1 += 16;
}
boffset0 = boffset1;
}
if (j < n) {
uint32_t remains = n - j;
__mmask32 r_mask = (1UL << remains) - 1;
if (remains > 16) {
boffset1 = boffset0 + m * 16;
uint32_t tail1 = remains - 16;
__mmask16 w_mask1 = (1UL << tail1) - 1;
for (i = 0; i < m2; i += 2) {
__m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
__m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
__m512i a00 = _mm512_unpacklo_epi16(a0, a1);
__m512i a01 = _mm512_unpackhi_epi16(a0, a1);
a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
_mm512_storeu_si512(boffset0, a0);
_mm512_mask_storeu_epi32(boffset1, w_mask1, a1);
boffset0 += 32;
boffset1 += 2 * tail1;
}
for (; i < m; i++) {
__m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]);
__m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]);
_mm256_storeu_si256((void *)boffset0, a0);
_mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1);
boffset0 += 16;
boffset1 += tail1;
}
} else {
__mmask16 w_mask = (1UL << remains ) - 1;
for (i = 0; i < m2; i += 2) {
__m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
__m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
__m512i a00 = _mm512_unpacklo_epi16(a0, a1);
__m512i a01 = _mm512_unpackhi_epi16(a0, a1);
a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
_mm512_mask_storeu_epi32(boffset0, w_mask, a0);
boffset0 += 2 * remains;
}
for (; i < m; i++) {
__m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]);
_mm256_mask_storeu_epi16(boffset0, w_mask, a0);
boffset0 += remains;
}
}
}
return 0;
}


@ -0,0 +1,216 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <immintrin.h>
#include "common.h"
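/*
 * Packing routine for up to 24-wide blocks: a 24-column block is stored as six
 * m*4 sub-blocks (blk_size = m * 4, three per output pointer), followed by
 * 8-column blocks and a masked tail; consecutive lda-spaced rows are
 * interleaved in pairs via the epi16 unpacks to match the bf16 dot-product
 * layout used by the kernel.
 */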
#define STORE_VEC(Bx, By, vec) \
if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \
else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2));
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;
IFLOAT *boffset0, *boffset1;
boffset0 = b;
BLASLONG n24 = n - (n % 24);
BLASLONG n8 = n & ~7;
BLASLONG m8 = m & ~7;
BLASLONG m4 = m & ~3;
BLASLONG m2 = m & ~1;
int permute_table[] = {
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
};
j = 0;
if (n > 23) {
/* n = 24 is the maximum width in the current blocking setting */
__m512i idx_lo_128 = _mm512_loadu_si512(permute_table);
__m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16);
__m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32);
__m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48);
__mmask32 mask24 = (1UL << 24) - 1;
BLASLONG blk_size = m * 4;
BLASLONG stride = blk_size * 3;
for (; j < n24; j += 24) {
boffset1 = boffset0 + stride;
for (i = 0; i < m8; i += 8) {
__m512i r0, r1, r2, r3, r4, r5, r6, r7;
__m512i t0, t1, t2, t3, t4, t5, t6, t7;
r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]);
r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]);
r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]);
r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]);
r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]);
r5 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]);
r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]);
r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]);
t0 = _mm512_unpacklo_epi16(r0, r1);
t1 = _mm512_unpackhi_epi16(r0, r1);
t2 = _mm512_unpacklo_epi16(r2, r3);
t3 = _mm512_unpackhi_epi16(r2, r3);
t4 = _mm512_unpacklo_epi16(r4, r5);
t5 = _mm512_unpackhi_epi16(r4, r5);
t6 = _mm512_unpacklo_epi16(r6, r7);
t7 = _mm512_unpackhi_epi16(r6, r7);
r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2);
r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3);
r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6);
r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7);
r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2);
r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3);
r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6);
r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7);
t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2);
t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3);
t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6);
t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7);
t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2);
t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3);
STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2);
STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5);
boffset0 += 32;
boffset1 += 32;
}
for (; i < m2; i += 2) {
__m512i r0, r1, t0, t1;
r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]);
r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]);
t0 = _mm512_unpacklo_epi16(r0, r1);
t1 = _mm512_unpackhi_epi16(r0, r1);
STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0));
STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0));
STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1));
STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1));
STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2));
STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2));
boffset0 += 8;
boffset1 += 8;
}
for (; i < m; i++) {
*(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0];
*(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4];
*(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8];
*(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12];
*(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16];
*(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20];
boffset0 += 4;
boffset1 += 4;
}
boffset0 += stride * 2;
}
}
for (; j < n8; j += 8) {
boffset1 = boffset0 + m * 4;
for (i = 0; i < m4; i += 4) {
__m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
__m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
__m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]);
__m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]);
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
__m128i a01 = _mm_unpackhi_epi16(a0, a1);
__m128i a10 = _mm_unpacklo_epi16(a2, a3);
__m128i a11 = _mm_unpackhi_epi16(a2, a3);
_mm_storeu_si128((void *)(boffset0 + 0), a00);
_mm_storeu_si128((void *)(boffset0 + 8), a10);
_mm_storeu_si128((void *)(boffset1 + 0), a01);
_mm_storeu_si128((void *)(boffset1 + 8), a11);
boffset0 += 16;
boffset1 += 16;
}
for (; i < m2; i+= 2) {
__m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
__m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
__m128i a01 = _mm_unpackhi_epi16(a0, a1);
_mm_storeu_si128((void *)(boffset0 + 0), a00);
_mm_storeu_si128((void *)(boffset1 + 0), a01);
boffset0 += 8;
boffset1 += 8;
}
for (; i < m; i++) {
__m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]);
_mm_store_sd((void *)boffset0, a0);
_mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1));
boffset0 += 4;
boffset1 += 4;
}
boffset0 = boffset1;
}
if (j < n) {
uint32_t remains = n - j;
__mmask8 r_mask = (1UL << remains) - 1;
if (remains > 4) {
boffset1 = boffset0 + m * 4;
uint32_t tail1 = remains - 4;
__mmask8 w_mask1 = (1UL << tail1) - 1;
for (i = 0; i < m2; i += 2) {
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
__m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
__m128i a01 = _mm_unpackhi_epi16(a0, a1);
_mm_storeu_si128((void *)boffset0, a00);
_mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01);
boffset0 += 8;
boffset1 += 2 * tail1;
}
for (; i < m; i++) {
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
_mm_store_sd((void *)boffset0, (__m128d) a0);
_mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1));
boffset0 += 4;
boffset1 += tail1;
}
} else {
for (i = 0; i < m2; i += 2) {
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
__m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
_mm_mask_storeu_epi32((void *)boffset0, r_mask, a00);
boffset0 += 2 * remains;
}
for (; i < m; i++) {
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
_mm_mask_storeu_epi16((void *)boffset0, r_mask, a0);
}
}
}
return 0;
}


@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Include common macros for BF16 based operations with IA intrinsics
#include "bf16_common_macros.h"
#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE8_COMPLETE_RESULT
#undef STORE8_MASK_COMPLETE_RESULT
#undef STORE4_COMPLETE_RESULT
#undef STORE4_MASK_COMPLETE_RESULT
#ifndef ZERO_BETA // Beta is non-zero
#ifndef ONE_BETA // BETA is not ONE
@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3;
@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
__mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
__mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value);
accum512_0 = _mm512_setzero_ps();


@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Include common macros for BF16 based operations with IA intrinsics
#include "bf16_common_macros.h"
#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE8_COMPLETE_RESULT
#undef STORE8_MASK_COMPLETE_RESULT
#undef STORE4_COMPLETE_RESULT
#undef STORE4_MASK_COMPLETE_RESULT
#ifndef ZERO_BETA // Beta is non-zero
#ifndef ONE_BETA // BETA is not ONE
@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
unsigned char load_mask_value = (((unsigned char)0xff) >> 6);
@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
} else if (tail_num == 8) {
__m256 result256 = _mm256_setzero_ps();
__m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2
__m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2
__m256i xArray256 = _mm512_castsi512_si256(xArray);
result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256);
@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);
@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
result256_0 = _mm256_setzero_ps();
result256_1 = _mm256_setzero_ps();
matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element
matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element
matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element
matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element
matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element
matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element
matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row
matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row
@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
if (tail_num > 10) {
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1)));
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element
matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element
matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element
matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element
matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows
matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row
@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
} else if (tail_num > 5) {
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2)));
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element
matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element
matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows
matrixArray256_2 = _mm256_setzero_si256();
@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_1 = _mm512_set1_epi32(1);
@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512 result_0, result_1;
@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_1 = _mm512_set1_epi32(1);
@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
result256_0 = _mm256_setzero_ps();
matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element
matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element
matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element
matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element
matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element
matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element
// Process the 0|1 elements
// Select the 0|1 elements for each row
@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_2 = _mm512_set1_epi32(2);
@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
{
BLASLONG tag_m_16x = m & (~15);
__m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
__m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|
if (tag_m_16x > 0) {
__m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_2 = _mm512_set1_epi32(2);
@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m128 result128, tmp128;
for (BLASLONG i = tag_m_16x; i < m; i++) {
result128 = _mm_setzero_ps();
matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8
matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8
result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
tmp128 = _mm_shuffle_ps(result128, result128, 14);
result128 = _mm_add_ps(result128, tmp128);
@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7);
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
__m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
__m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|
__m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0|
if (tag_m_14x > 0) {
@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x,
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m256i M256_EPI16_2 = _mm256_set1_epi16(2);
@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3);
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
__m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
__m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|
__m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0|
if (tag_m_12x > 0) {
@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m256i M256_EPI32_1 = _mm256_set1_epi32(1);
@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5);
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
__m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7|
__m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7|
__m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0|
if (tag_m_15x > 0) {
@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4);
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
__m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7|
__m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7|
__m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0|
if (tag_m_15x > 0) {
@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_4 = _mm512_set1_epi32(4);
__m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6);
__mmask32 load_mask = *((__mmask32*) &load_mask_value);
// Prepare X with 2-step interleave way
xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
BF16_INTERLEAVE_1x32(xArray)
@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_4 = _mm512_set1_epi32(4);
@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_4 = _mm512_set1_epi32(4);
__m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);
__mmask32 load_mask = *((__mmask32*) &load_mask_value);
// Prepare X with 2-step interleave way
xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
BF16_INTERLEAVE_1x32(xArray)
@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
{
BLASLONG tag_m_16x = m & (~15);
__m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|
__m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|
if (tag_m_16x > 0) {
__m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i M512_EPI32_4 = _mm512_set1_epi32(4);
@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x
__m128 accum128, tmp128;
for (BLASLONG i = tag_m_16x; i < m; i++) {
accum256 = _mm256_setzero_ps();
matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16
matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
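_mm256_dpbf16_ps is the AVX512-BF16 building block used throughout these kernels: each fp32 lane of the accumulator receives the product-sum of one adjacent pair of bf16 elements, so a 16-wide row-times-vector product needs a single instruction plus a horizontal reduction. A standalone sketch of the tail computation under that reading; the function name and the plain uint16_t view of bf16 storage are assumptions:

    #include <immintrin.h>
    #include <stdint.h>

    /* Dot product of 16 bf16 values a[0..15] with x[0..15] using AVX512-BF16.
     * Requires a CPU and compiler flags providing AVX512BF16 + AVX512VL. */
    static float dot_bf16x16(const uint16_t *a, const uint16_t *x)
    {
        __m256 acc = _mm256_dpbf16_ps(_mm256_setzero_ps(),
                                      (__m256bh)_mm256_loadu_si256((const __m256i *)a),
                                      (__m256bh)_mm256_loadu_si256((const __m256i *)x));
        __m128 lo  = _mm_add_ps(_mm256_castps256_ps128(acc),
                                _mm256_extractf128_ps(acc, 1));
        lo = _mm_add_ps(lo, _mm_shuffle_ps(lo, lo, 0x0e));
        lo = _mm_add_ps(lo, _mm_shuffle_ps(lo, lo, 0x01));
        return _mm_cvtss_f32(lo);
    }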
@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
@ -2647,8 +2680,6 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b
BLASLONG tag_n_32x = n & (~31);
BLASLONG tag_n_128x = n & (~127);
__m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
__m512 accum512_bridge[8];
__m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
__m256 accum256_0;
@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif
__m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16
__m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha));
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta));
#endif
#endif
__m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \
@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16
__m128 accum128, tmp128;
for (BLASLONG i = tag_m_8x; i < m; i++) {
accum256_0 = _mm256_setzero_ps();
matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16
matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16
accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);

View File

@ -41,7 +41,7 @@
#include <immintrin.h>
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc){
BLASLONG i, j;

View File

@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#endif
#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL
#ifndef HAVE_KERNEL_4x2
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@ -170,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
}
#endif
#endif
#ifndef HAVE_KERNEL_4x1

View File

@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
if (n2 < 32) {
__m128d accum_10, accum_11, accum_12, accum_13;
__m128d abs_mask1;
__m128d abs_mask1 = abs_mask1;
accum_10 = _mm_setzero_pd();
accum_11 = _mm_setzero_pd();

View File

@ -351,7 +351,7 @@
*
* Quick return if possible
*
IF( N.LE.0 ) THEN
IF( (N.LE.0) .OR. (M.LE.0) ) THEN
RETURN
END IF
*

View File

@ -353,7 +353,7 @@
*
* Quick return if possible
*
IF( N.LE.0 ) THEN
IF( (N.LE.0).OR.(M.LE.0) ) THEN
RETURN
END IF
*

View File

@ -353,7 +353,7 @@
*
* Quick return if possible
*
IF( N.LE.0 ) THEN
IF( (N.LE.0).OR.(M.LE.0) ) THEN
RETURN
END IF
*

View File

@ -351,7 +351,7 @@
*
* Quick return if possible
*
IF( N.LE.0 ) THEN
IF( (N.LE.0).OR.(M.LE.0) ) THEN
RETURN
END IF
*

param.h
View File

@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define USE_SGEMM_KERNEL_DIRECT 1
#undef SBGEMM_DEFAULT_UNROLL_N
#undef SBGEMM_DEFAULT_UNROLL_M
#undef SBGEMM_DEFAULT_P
#undef SBGEMM_DEFAULT_R
#undef SBGEMM_DEFAULT_Q
#define SBGEMM_DEFAULT_UNROLL_N 4
#define SBGEMM_DEFAULT_UNROLL_M 16
#define SBGEMM_DEFAULT_P 384
#define SBGEMM_DEFAULT_Q 768
#define SBGEMM_DEFAULT_R sbgemm_r
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
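The block added to param.h gives the bfloat16 GEMM its own blocking parameters instead of inheriting generic ones: UNROLL_M/UNROLL_N are the micro-kernel register-block dimensions (matching the 16-row kernels above), while P and Q size the packed panels kept resident in cache. Under the usual OpenBLAS convention that P blocks M and Q blocks K, one packed A panel holds P*Q two-byte elements; a quick footprint check under that assumption:

    #include <stdio.h>
    #include <stdint.h>

    /* Rough footprint of one packed A panel with the SBGEMM defaults above,
     * assuming P blocks the M dimension, Q blocks K, and bf16 is 2 bytes. */
    int main(void)
    {
        const size_t P = 384, Q = 768;
        printf("packed A panel: %zu KiB\n", P * Q * sizeof(uint16_t) / 1024);  /* 576 KiB */
        return 0;
    }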
@ -2454,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
#else
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8