Compare commits
108 Commits
revert-205
...
revert-232
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83dae28ae2 | ||
|
|
da986d2e83 | ||
|
|
6bc487de35 | ||
|
|
f95989cbc1 | ||
|
|
04226f1e97 | ||
|
|
0925ef70db | ||
|
|
371e6f73d4 | ||
|
|
d117dfd505 | ||
|
|
883c39773a | ||
|
|
b09b5be0a4 | ||
|
|
bfb5fbdb4d | ||
|
|
3da6d66da9 | ||
|
|
08fa83aba2 | ||
|
|
63d3ee8dfc | ||
|
|
1191db1a49 | ||
|
|
1f6071590d | ||
|
|
0caf1434c9 | ||
|
|
73128f3883 | ||
|
|
cad0d150db | ||
|
|
eba0aeb7cd | ||
|
|
0c07c356c1 | ||
|
|
82b75f97e5 | ||
|
|
7887c45077 | ||
|
|
3e67017ac8 | ||
|
|
b3ac6ee222 | ||
|
|
6082e556cd | ||
|
|
92315173d5 | ||
|
|
351d12b94e | ||
|
|
bf73aa141b | ||
|
|
71e96163db | ||
|
|
819e852ae7 | ||
|
|
4e466d739c | ||
|
|
4c6a457358 | ||
|
|
836c414e22 | ||
|
|
d403eb3c2f | ||
|
|
3cd97f1a80 | ||
|
|
9955f0996f | ||
|
|
430c11e135 | ||
|
|
fbacd2605d | ||
|
|
6fa89b06a1 | ||
|
|
68597002ea | ||
|
|
d2a6285549 | ||
|
|
d999688d1a | ||
|
|
928fe1b28e | ||
|
|
ccc28c6d60 | ||
|
|
ae43b75a6a | ||
|
|
54fc06fd70 | ||
|
|
1df9a2013d | ||
|
|
274ff5cdb8 | ||
|
|
eb2eddf241 | ||
|
|
8691825944 | ||
|
|
7dc8a76f60 | ||
|
|
df857551c0 | ||
|
|
85ccdce8c4 | ||
|
|
aeabe0a83f | ||
|
|
1b90989662 | ||
|
|
e3e8b5cdca | ||
|
|
69b16a894d | ||
|
|
6782e5767d | ||
|
|
48f5a89f92 | ||
|
|
4ae1610f37 | ||
|
|
911c3e2f4b | ||
|
|
fab49e49e5 | ||
|
|
b687fba5bc | ||
|
|
46a8c2519a | ||
|
|
e9437eebd2 | ||
|
|
3a39062cfc | ||
|
|
eaa0be1313 | ||
|
|
6ff013bae0 | ||
|
|
0d669e04bb | ||
|
|
17cdd9f9e1 | ||
|
|
6bcb06fcb1 | ||
|
|
b7315f8401 | ||
|
|
9b19e9e1b0 | ||
|
|
6bd67ddbab | ||
|
|
5da9484d93 | ||
|
|
844629af57 | ||
|
|
2beaa82c05 | ||
|
|
e8a2aed2b9 | ||
|
|
f262031685 | ||
|
|
5f6206fa2d | ||
|
|
f2cde2ccfb | ||
|
|
ba7838d2e1 | ||
|
|
a448884a63 | ||
|
|
17609f88f1 | ||
|
|
3a2df19db6 | ||
|
|
d2093a40d3 | ||
|
|
aa04b0925e | ||
|
|
258ac56e0a | ||
|
|
56837e9d92 | ||
|
|
bb5413863f | ||
|
|
32f5907fef | ||
|
|
ac10236cc8 | ||
|
|
8617d75548 | ||
|
|
c07d78b9e9 | ||
|
|
6355c25dde | ||
|
|
5e244d80f2 | ||
|
|
ede5efebab | ||
|
|
84908d60d2 | ||
|
|
596a22325a | ||
|
|
7f58f3ad0e | ||
|
|
c0d570a357 | ||
|
|
6b83079368 | ||
|
|
673e5a0495 | ||
|
|
bfa2cc7d64 | ||
|
|
e7c4d6705a | ||
|
|
2a1911cc14 | ||
|
|
256fc15f5f |
32
.travis.yml
32
.travis.yml
@@ -17,7 +17,7 @@ matrix:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
@@ -25,14 +25,14 @@ matrix:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64"
|
||||
|
||||
# - <<: *test-ubuntu
|
||||
# os: linux-ppc64le
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
# env:
|
||||
# # for matrix annotation only
|
||||
# - TARGET_BOX=PPC64LE_LINUX
|
||||
# - BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
- <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX
|
||||
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
@@ -162,16 +162,24 @@ matrix:
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
- BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
- BTYPE="BINARY=32 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- COMMON_FLAGS="NUM_THREADS=32"
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
|
||||
- CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
||||
@@ -322,12 +322,13 @@ CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Test for supporting MS_ABI
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Majar version > 4
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
@@ -554,8 +555,17 @@ endif
|
||||
# Kernel sets built for DYNAMIC_ARCH when ARCH is power.
# POWER6 and POWER8 are always listed. POWER9 is added for every non-GCC
# compiler, and for GCC only when GCCVERSIONGT5 evaluates to 1.
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
DYNAMIC_CORE += POWER9
else
# The function name must be followed by whitespace, not a comma:
# "$(info, ...)" is parsed as a reference to an undefined variable and
# silently expands to nothing, so the message was never shown.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
endif
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
@@ -759,6 +769,9 @@ else
|
||||
FCOMMON_OPT += -m32
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
FCOMMON_OPT += -fno-second-underscore
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
13
c_check
13
c_check
@@ -260,6 +260,19 @@ if ($architecture ne $hostarch) {
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
|
||||
# the initial autodetection will have been confused by the command-line arguments to clang
|
||||
# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
|
||||
# On Darwin with a non-empty cross suffix, rewrite the cross-compile settings:
# take the directory part of the clang path reported by xcrun as the new
# cross suffix, force cross mode, and set the architecture to arm64.
if (($os eq "Darwin") && ($cross_suffix ne "")) {
    my $tmpnam = `xcrun --sdk iphoneos --find clang`;
    # Keep everything up to and including the last "/", i.e. the directory
    # that contains clang (this also drops the trailing newline of the output).
    # this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
    $cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/") + 1);
    $cross = 1;
    # Quoted string instead of a bareword: a bareword is rejected under
    # "use strict" and would call a sub named arm64 if one ever existed.
    $architecture = "arm64";
}
|
||||
|
||||
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
||||
@@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
|
||||
@@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
/*
 * Execute an "isb" barrier and then read the cntvct_el0 system register,
 * returning its value as a BLASULONG.
 * NOTE(review): presumably used as a tick counter for timing - confirm
 * against the callers.
 */
static __inline BLASULONG rpcc(void){
  BLASULONG counter = 0;

  __asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(counter));

  return counter;
}
|
||||
|
||||
#define RPCC_DEFINED
|
||||
#define RPCC64BIT
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
@@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
.macro PROLOGUE
|
||||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
.endm
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
|
||||
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
||||
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *c, BLASLONG ldc, int (*fuction)());
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
|
||||
@@ -206,6 +206,33 @@ void get_subdirname(void)
|
||||
printf("arm64");
|
||||
}
|
||||
|
||||
/*
 * Print a "#define NUM_CORES <n>" line to stdout, where <n> is the number of
 * lines in /proc/cpuinfo that begin with "processor".
 *
 * Only active when the "linux" macro is defined; otherwise it does nothing.
 * If /proc/cpuinfo cannot be opened, nothing is printed (the previous code
 * passed the NULL stream straight to fgets()/fclose()).
 */
void get_cpucount(void)
{
#ifdef linux
  FILE *infile;
  char buffer[2048];
  int n = 0;

  infile = fopen("/proc/cpuinfo", "r");
  if (infile == NULL)
    return;

  while (fgets(buffer, sizeof(buffer), infile))
  {
    /* one "processor" entry per logical CPU listed in the file */
    if (!strncmp("processor", buffer, 9))
      n++;
  }

  fclose(infile);

  printf("#define NUM_CORES %d\n", n);
#endif
}
|
||||
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
@@ -309,6 +336,7 @@ void get_cpuconfig(void)
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
||||
|
||||
@@ -351,5 +379,3 @@ void get_features(void)
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
20
cpuid_x86.c
20
cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
|
||||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_CORE2;
|
||||
#else
|
||||
return CPUTYPE_PENTIUM2;
|
||||
#endif
|
||||
case 7:
|
||||
case 8:
|
||||
case 10:
|
||||
@@ -1379,8 +1383,8 @@ int get_cpuname(void){
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
switch (model) {
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Ice Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
@@ -1427,7 +1431,11 @@ int get_cpuname(void){
|
||||
case 0x5:
|
||||
return CPUTYPE_AMDK6;
|
||||
case 0x6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_BARCELONA;
|
||||
#else
|
||||
return CPUTYPE_ATHLON;
|
||||
#endif
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 0:
|
||||
@@ -1810,7 +1818,11 @@ int get_coretype(void){
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CORE_CORE2;
|
||||
#else
|
||||
return CORE_P6;
|
||||
#endif
|
||||
case 7:
|
||||
return CORE_KATMAI;
|
||||
case 8:
|
||||
@@ -2017,7 +2029,11 @@ int get_coretype(void){
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (family <= 0x5) return CORE_80486;
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
if (family <= 0xe) return CORE_BARCELONA;
|
||||
#else
|
||||
if (family <= 0xe) return CORE_ATHLON;
|
||||
#endif
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
|
||||
@@ -30,17 +30,20 @@
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14"
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14"
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
@@ -66,6 +69,8 @@ int detect(void)
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
@@ -1503,6 +1503,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -1504,6 +1504,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -329,7 +329,7 @@ int support_avx512(){
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
|
||||
@@ -37,8 +37,10 @@
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
@@ -105,13 +107,17 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
@@ -19,7 +21,9 @@ static char *corename[] = {
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
@@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
||||
@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -312,7 +312,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -1673,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -1736,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -1855,7 +1855,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -1945,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1953,7 +1953,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -1977,7 +1977,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -38,21 +38,29 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Dummy Function */
|
||||
int goto_get_num_procs (void) { return 1;};
|
||||
void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
/*
 * Processor count for the branch taken when NO_AFFINITY is defined.
 *
 * The configured processor count is read once through sysconf() and cached
 * in a function-local static. When OS_LINUX is not defined that count is
 * returned; when OS_LINUX is defined the function returns 1.
 *
 * NOTE(review): the original carried a commented-out block that narrowed the
 * count with sched_getaffinity()/CPU_COUNT, plus the locals only that block
 * used. Both are dropped here because they were dead code; confirm that the
 * fixed "1" on Linux is intentional before re-enabling anything.
 */
int get_num_procs(void) {
  static int nums = 0;

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

#if !defined(OS_LINUX)
  return nums;
#else
  return 1;
#endif
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
/*
 * Android variant: return the value of sysconf(_SC_NPROCESSORS_CONF).
 * The value is fetched on the first call and cached in a static afterwards.
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
/*
 * Haiku variant: return the value of sysconf(_SC_NPROCESSORS_CONF).
 * The value is fetched on the first call and cached in a static afterwards.
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/*
 * AIX variant: return the value of sysconf(_SC_NPROCESSORS_CONF).
 * The value is fetched on the first call and cached in a static afterwards.
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int m[2];
|
||||
size_t len;
|
||||
|
||||
if (nums == 0) {
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &nums, &len, NULL, 0);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
/*
 * Darwin variant: return the value of the "hw.physicalcpu" sysctl, cached in
 * a static after the first call.
 *
 * If sysctlbyname() fails or reports less than one processor, the count
 * falls back to 1. Previously the return value was ignored, so a failure
 * left the count at 0 for every caller.
 */
int get_num_procs(void) {
  static int nums = 0;
  size_t len;

  if (nums == 0) {
    len = sizeof(int);
    if (sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0) != 0 || nums < 1) {
      nums = 1;
    }
  }
  return nums;
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the numbers of CPU cores in multithreading.
|
||||
It can be set by openblas_set_num_threads(int num_threads);
|
||||
*/
|
||||
int blas_cpu_number = 0;
|
||||
/*
|
||||
The numbers of threads in the thread pool.
|
||||
This value is equal or large than blas_cpu_number. This means some threads are sleep.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
||||
/*
 * Register blas_thread_shutdown as the "prepare" handler of pthread_atfork()
 * and emit a warning if the registration fails.
 *
 * The registration is compiled in only when SMP_SERVER is defined and the
 * target is neither Android nor native (non-Cygwin) Windows; otherwise the
 * function body is empty.
 */
void openblas_fork_handler()
{
  /*
   * This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS
   * is built with "make USE_OPENMP=0".
   * Hanging can still happen when OpenBLAS is built against the libgomp
   * implementation of OpenMP. The problem is tracked at:
   *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
   * In the mean time build with USE_OPENMP=0 or link against another
   * implementation of OpenMP.
   */
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  const int status =
      pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);

  if (status != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Return the processor count: the result of get_num_procs() when SMP is
 * defined, and the constant 1 otherwise.
 */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
|
||||
|
||||
/*
 * Return the thread count: the constant 1 when SMP is not defined;
 * otherwise blas_cpu_number, after calling blas_get_cpu_number() so that
 * the value is initialised if it was not already.
 */
int openblas_get_num_threads(void) {
#ifdef SMP
  /* make sure blas_cpu_number has been set up before reading it */
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
|
||||
|
||||
@@ -618,19 +618,6 @@
|
||||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
@@ -647,33 +634,8 @@
|
||||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
@@ -690,45 +652,8 @@
|
||||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
@@ -745,45 +670,8 @@
|
||||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
@@ -800,43 +688,13 @@
|
||||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
|
||||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
ssysv_aa_2stage,
|
||||
ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage,
|
||||
chesv_aa_2stage,
|
||||
chetrf_aa_2stage,
|
||||
chetrs_aa_2stage,
|
||||
csysv_aa_2stage,
|
||||
csytrf_aa_2stage,
|
||||
csytrs_aa_2stage,
|
||||
dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage,
|
||||
dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage,
|
||||
zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage,
|
||||
zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage,
|
||||
zsytrs_aa_2stage
|
||||
ilaenv2stage
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
@@ -3509,6 +3367,59 @@
|
||||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
# 3.7.0
|
||||
slasyf_rk, ssyconvf_rook, ssytf2_rk,
|
||||
ssytrf_rk, ssytrs_3, ssytri_3,
|
||||
ssytri_3x, ssycon_3, ssysv_rk,
|
||||
slasyf_aa, ssysv_aa, ssytrf_aa,
|
||||
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
|
||||
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
|
||||
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
|
||||
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
|
||||
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
|
||||
dsytf2_rk, dsytrf_rk, dsytrs_3,
|
||||
dsytri_3, dsytri_3x, dsycon_3,
|
||||
dsysv_rk, dlasyf_aa, dsysv_aa,
|
||||
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
|
||||
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
|
||||
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
|
||||
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
|
||||
dsbevd_2stage, dsygv_2stage, chetf2_rk,
|
||||
chetrf_rk, chetri_3, chetri_3x,
|
||||
chetrs_3, checon_3, chesv_rk,
|
||||
chesv_aa, chetrf_aa, chetrs_aa,
|
||||
clahef_aa, clahef_rk, clasyf_rk,
|
||||
clasyf_aa, csytf2_rk, csytrf_rk,
|
||||
csytrf_aa, csytrs_3, csytrs_aa,
|
||||
csytri_3, csytri_3x, csycon_3,
|
||||
csysv_rk, csysv_aa, csyconvf_rook,
|
||||
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
|
||||
chb2st_kernels, cheevd_2stage, cheev_2stage,
|
||||
cheevx_2stage, cheevr_2stage, chbev_2stage,
|
||||
chbevx_2stage, chbevd_2stage, chegv_2stage,
|
||||
zhetf2_rk, zhetrf_rk, zhetri_3,
|
||||
zhetri_3x, zhetrs_3, zhecon_3,
|
||||
zhesv_rk, zhesv_aa, zhetrf_aa,
|
||||
zhetrs_aa, zlahef_aa, zlahef_rk,
|
||||
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
|
||||
zsytrs_aa, zsytf2_rk, zsytrf_rk,
|
||||
zsytrf_aa, zsytrs_3, zsytri_3,
|
||||
zsytri_3x, zsycon_3, zsysv_rk,
|
||||
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
|
||||
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
|
||||
zheev_2stage, zheevx_2stage, zheevr_2stage,
|
||||
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
# 3.8.0
|
||||
ssysv_aa_2stage, ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage, chesv_aa_2stage,
|
||||
chetrf_aa_2stage, chetrs_aa_2stage,
|
||||
csysv_aa_2stage, csytrf_aa_2stage,
|
||||
csytrs_aa_2stage, dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage, dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage, zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage, zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage, zsytrs_aa_2stage
|
||||
);
|
||||
|
||||
|
||||
|
||||
9
f_check
9
f_check
@@ -19,7 +19,7 @@ $nofortran = 0;
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
@@ -130,6 +130,11 @@ if ($compiler eq "") {
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
@@ -277,6 +282,8 @@ $linker_a = "";
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
|
||||
@@ -24,9 +24,11 @@ ifeq ($(TARGET), LOONGSON3B)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), GENERIC)
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
|
||||
@@ -91,12 +91,10 @@ IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
@@ -104,38 +102,6 @@ CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(OS_DARWIN)$(CROSS),11)
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
@@ -202,5 +168,3 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
|
||||
@@ -54,37 +54,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
@@ -54,138 +54,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr s5, [X], #4
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr d5, [X], #8
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_END_\@:
|
||||
4: /* KERNEL_F1_END_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X]
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq KERNEL_S1_NEXT
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_S1_SCALE_GE_XR_\@
|
||||
bge KERNEL_S1_SCALE_GE_XR
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_S1_NEXT_\@
|
||||
KERNEL_S1_SCALE_GE_XR_\@:
|
||||
b KERNEL_S1_NEXT
|
||||
KERNEL_S1_SCALE_GE_XR:
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_S1_NEXT_\@:
|
||||
KERNEL_S1_NEXT:
|
||||
ldr s5, [X, #4]
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_S1_END_\@
|
||||
beq KERNEL_S1_END
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_S1_SCALE_GE_XI_\@
|
||||
bge KERNEL_S1_SCALE_GE_XI
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_S1_END_\@
|
||||
KERNEL_S1_SCALE_GE_XI_\@:
|
||||
b KERNEL_S1_END
|
||||
KERNEL_S1_SCALE_GE_XI:
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X]
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq KERNEL_S1_NEXT
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_S1_SCALE_GE_XR_\@
|
||||
bge KERNEL_S1_SCALE_GE_XR
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_S1_NEXT_\@
|
||||
KERNEL_S1_SCALE_GE_XR_\@:
|
||||
b KERNEL_S1_NEXT
|
||||
KERNEL_S1_SCALE_GE_XR:
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_S1_NEXT_\@:
|
||||
KERNEL_S1_NEXT:
|
||||
ldr d5, [X, #8]
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_S1_END_\@
|
||||
beq KERNEL_S1_END
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_S1_SCALE_GE_XI_\@
|
||||
bge KERNEL_S1_SCALE_GE_XI
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_S1_END_\@
|
||||
KERNEL_S1_SCALE_GE_XI_\@:
|
||||
b KERNEL_S1_END
|
||||
KERNEL_S1_SCALE_GE_XI:
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_S1_END_\@:
|
||||
KERNEL_S1_END:
|
||||
add X, X, INC_X
|
||||
.endm
|
||||
|
||||
|
||||
@@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMAXKERNEL = isamax_power8.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMAXKERNEL = icamax_power8.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMINKERNEL = isamin_power8.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMINKERNEL = icamin_power8.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
@@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
|
||||
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ISAMAXKERNEL = isamax.c
|
||||
ISAMAXKERNEL = isamax_power9.S
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
ICAMAXKERNEL = icamax_power9.S
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ISAMINKERNEL = isamin.c
|
||||
ISAMINKERNEL = isamin_power9.S
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
ICAMINKERNEL = icamin_power9.S
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
@@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
CAXPYKERNEL = caxpy_power9.S
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
|
||||
@@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S
|
||||
|
||||
SAXPYKERNEL = axpy_ppc440.S
|
||||
DAXPYKERNEL = axpy_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
else
|
||||
CAXPYKERNEL = zaxpy_ppc440.S
|
||||
ZAXPYKERNEL = zaxpy_ppc440.S
|
||||
endif
|
||||
|
||||
SDOTKERNEL = dot_ppc440.S
|
||||
DDOTKERNEL = dot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CDOTKERNEL = zdot_ppc440.S
|
||||
ZDOTKERNEL = zdot_ppc440.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
|
||||
ISAMAXKERNEL = iamax_ppc440.S
|
||||
IDAMAXKERNEL = iamax_ppc440.S
|
||||
@@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
||||
|
||||
SROTKERNEL = rot_ppc440.S
|
||||
DROTKERNEL = rot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CROTKERNEL = zrot_ppc440.S
|
||||
ZROTKERNEL = zrot_ppc440.S
|
||||
else
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
SSCALKERNEL = scal_ppc440.S
|
||||
DSCALKERNEL = scal_ppc440.S
|
||||
@@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
endif
|
||||
|
||||
|
||||
@@ -1,3 +1,14 @@
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMMKERNEL = gemm_kernel.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
SGEMMKERNEL = gemm_kernel_altivec.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
@@ -7,6 +18,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DGEMMKERNEL = gemm_kernel.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
@@ -16,6 +29,18 @@ DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY =
|
||||
CGEMMITCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ =
|
||||
CGEMMITCOPYOBJ =
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
CGEMMKERNEL = zgemm_kernel_altivec.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
@@ -25,6 +50,8 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
@@ -35,22 +62,30 @@ ZGEMMITCOPYOBJ =
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
#STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
#STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
#STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
#STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
#CTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
#CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
#CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
#CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
582
kernel/power/caxpy_power8.S
Normal file
582
kernel/power/caxpy_power8.S
Normal file
@@ -0,0 +1,582 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/*
|
||||
.file "caxpy.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl caxpy_k
|
||||
.type caxpy_k, @function
|
||||
*/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
mr. 7,3
|
||||
ble 0,.L33
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L41
|
||||
.L3:
|
||||
mtctr 7
|
||||
ld 7,96(1)
|
||||
sldi 9,9,3
|
||||
sldi 7,7,3
|
||||
.p2align 4,,15
|
||||
.L14:
|
||||
lfs 10,4(8)
|
||||
lfs 11,0(8)
|
||||
lfs 12,0(10)
|
||||
lfs 0,4(10)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,11,1,10
|
||||
#else
|
||||
fmsubs 11,11,1,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,0(10)
|
||||
lfs 11,0(8)
|
||||
lfs 12,4(8)
|
||||
add 8,8,9
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,4(10)
|
||||
add 10,10,7
|
||||
bdnz .L14
|
||||
.L33:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L41:
|
||||
ld 6,96(1)
|
||||
cmpdi 7,6,1
|
||||
bne 7,.L3
|
||||
rldicr. 4,7,0,59
|
||||
std 31,-8(1)
|
||||
li 11,0
|
||||
bne 0,.L42
|
||||
.L4:
|
||||
addi 6,11,8
|
||||
subf 0,4,7
|
||||
sldi 6,6,2
|
||||
addi 9,6,-32
|
||||
add 5,10,6
|
||||
add 3,8,9
|
||||
add 6,8,6
|
||||
subfc 5,5,3
|
||||
add 9,10,9
|
||||
subfe 5,5,5
|
||||
subfc 6,6,9
|
||||
subfe 31,31,31
|
||||
addi 6,5,1
|
||||
addi 5,31,1
|
||||
or 6,6,5
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
sradi 6,4,63
|
||||
srdi 5,7,63
|
||||
subfc 31,7,4
|
||||
adde 6,5,6
|
||||
subfic 31,0,3
|
||||
subfe 31,31,31
|
||||
xori 6,6,0x1
|
||||
neg 31,31
|
||||
and 6,6,31
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
cmpd 7,4,7
|
||||
li 6,1
|
||||
blt 7,.L43
|
||||
.L9:
|
||||
addi 0,7,-1
|
||||
subf 0,4,0
|
||||
subfic 0,0,3
|
||||
subfe 31,31,31
|
||||
addi 0,31,1
|
||||
rlwinm 0,0,0,0xff
|
||||
cmpwi 7,0,0
|
||||
bne 7,.L10
|
||||
sradi 0,4,63
|
||||
subfc 31,7,4
|
||||
adde 5,5,0
|
||||
rlwinm 5,5,0,0xff
|
||||
cmpwi 7,5,0
|
||||
bne 7,.L10
|
||||
addi 0,6,-1
|
||||
addis 31,2,.LC3@toc@ha
|
||||
std 30,-16(1)
|
||||
xscvdpspn 12,1
|
||||
xscvdpspn 11,2
|
||||
srdi. 30,0,2
|
||||
addis 6,2,.LC2@toc@ha
|
||||
addi 6,6,.LC2@toc@l
|
||||
mtctr 30
|
||||
addi 31,31,.LC3@toc@l
|
||||
lxvd2x 42,0,6
|
||||
li 5,16
|
||||
li 6,0
|
||||
lxvd2x 41,0,31
|
||||
xxspltw 12,12,0
|
||||
xxspltw 11,11,0
|
||||
xxpermdi 42,42,42,2
|
||||
xxpermdi 41,41,41,2
|
||||
beq 0,.L44
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
#ifdef CONJ
|
||||
lxvd2x 44,3,6
|
||||
lxvd2x 45,3,5
|
||||
lxvd2x 33,9,6
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 44,44,44,2
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 32,33,33,2
|
||||
xxpermdi 33,0,0,2
|
||||
vperm 11,13,12,10
|
||||
vperm 13,13,12,9
|
||||
vperm 12,1,0,10
|
||||
vperm 1,1,0,9
|
||||
xvmulsp 0,11,43
|
||||
xvmulsp 32,11,45
|
||||
xvmsubmsp 45,12,0
|
||||
xvmaddasp 32,12,43
|
||||
xvaddsp 44,32,44
|
||||
xvsubsp 32,33,45
|
||||
vmrglw 1,0,12
|
||||
vmrghw 0,0,12
|
||||
#else
|
||||
lxvd2x 45,3,6
|
||||
lxvd2x 33,3,5
|
||||
lxvd2x 43,9,6
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 32,43,43,2
|
||||
xxpermdi 43,0,0,2
|
||||
vperm 12,1,13,10
|
||||
vperm 1,1,13,9
|
||||
vperm 13,11,0,10
|
||||
vperm 11,11,0,9
|
||||
xvmulsp 0,11,44
|
||||
xvmulsp 32,11,33
|
||||
xvmaddmsp 33,12,0
|
||||
xvmsubasp 32,12,44
|
||||
xvaddsp 45,32,45
|
||||
xvaddsp 32,33,43
|
||||
vmrglw 1,0,13
|
||||
vmrghw 0,0,13
|
||||
#endif
|
||||
xxpermdi 0,33,33,2
|
||||
xxpermdi 32,32,32,2
|
||||
stxvd2x 0,9,6
|
||||
addi 6,6,32
|
||||
stxvd2x 32,9,5
|
||||
addi 5,5,32
|
||||
bdnz .L11
|
||||
rldicr 0,0,0,61
|
||||
ld 30,-16(1)
|
||||
sldi 9,0,1
|
||||
add 4,4,0
|
||||
add 11,11,9
|
||||
.L10:
|
||||
sldi 6,11,2
|
||||
addi 9,4,1
|
||||
addi 5,6,4
|
||||
cmpd 7,7,9
|
||||
lfsx 12,8,6
|
||||
lfsx 0,10,6
|
||||
addi 9,11,2
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,12,1,11
|
||||
#else
|
||||
fmsubs 12,12,1,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 9,9,2
|
||||
addi 6,4,2
|
||||
addi 5,9,4
|
||||
cmpd 7,7,6
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
addi 6,11,4
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 6,6,2
|
||||
addi 4,4,3
|
||||
addi 5,6,4
|
||||
cmpd 7,7,4
|
||||
lfsx 12,8,6
|
||||
lfsx 0,10,6
|
||||
addi 9,11,6
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 9,9,2
|
||||
ld 31,-8(1)
|
||||
addi 7,9,4
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,7
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,7
|
||||
lfsx 0,10,7
|
||||
fmuls 2,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 1,1,12,2
|
||||
fsubs 1,0,1
|
||||
#else
|
||||
fmadds 1,1,12,2
|
||||
fadds 1,0,1
|
||||
#endif
|
||||
stfsx 1,10,7
|
||||
b .L33
|
||||
.L43:
|
||||
mr 6,0
|
||||
b .L9
|
||||
.L7:
|
||||
addi 10,4,1
|
||||
cmpd 7,10,7
|
||||
subf 10,4,7
|
||||
mtctr 10
|
||||
bgt 7,.L26
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,7,10
|
||||
beq 7,.L26
|
||||
.p2align 4,,15
|
||||
.L13:
|
||||
lfs 10,4(3)
|
||||
lfs 11,0(3)
|
||||
addi 9,9,8
|
||||
addi 3,3,8
|
||||
lfs 12,-8(9)
|
||||
lfs 0,-4(9)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,1,11,10
|
||||
#else
|
||||
fmsubs 11,1,11,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,-8(9)
|
||||
lfs 11,-8(3)
|
||||
lfs 12,-4(3)
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,-4(9)
|
||||
bdnz .L13
|
||||
.L39:
|
||||
ld 31,-8(1)
|
||||
b .L33
|
||||
.L42:
|
||||
#ifdef CONJ
|
||||
fneg 0,1
|
||||
xxpermdi 32,1,1,0
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
std 28,-32(1)
|
||||
sradi. 28,4,1
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xscvdpspn 5,2
|
||||
xvcvdpsp 32,32
|
||||
lxvd2x 12,0,9
|
||||
xxpermdi 39,0,0,0
|
||||
xxspltw 5,5,0
|
||||
xvcvdpsp 39,39
|
||||
#else
|
||||
fneg 0,2
|
||||
xxpermdi 39,2,2,0
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
std 28,-32(1)
|
||||
sradi. 28,4,1
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xscvdpspn 5,1
|
||||
xvcvdpsp 39,39
|
||||
lxvd2x 12,0,9
|
||||
xxpermdi 32,0,0,0
|
||||
xxspltw 5,5,0
|
||||
xvcvdpsp 32,32
|
||||
#endif
|
||||
xxpermdi 12,12,12,2
|
||||
vmrgew 7,7,0
|
||||
beq 0,.L5
|
||||
xxlnor 38,12,12
|
||||
std 29,-24(1)
|
||||
std 30,-16(1)
|
||||
mr 6,8
|
||||
mr 9,10
|
||||
li 29,0
|
||||
li 30,16
|
||||
li 31,32
|
||||
li 12,48
|
||||
li 0,64
|
||||
li 11,80
|
||||
li 3,96
|
||||
li 5,112
|
||||
.p2align 4,,15
|
||||
.L6:
|
||||
lxvd2x 6,0,9
|
||||
lxvd2x 40,0,6
|
||||
addi 29,29,8
|
||||
lxvd2x 41,6,30
|
||||
lxvd2x 42,6,31
|
||||
cmpd 7,28,29
|
||||
lxvd2x 43,6,12
|
||||
lxvd2x 44,6,0
|
||||
lxvd2x 45,6,11
|
||||
lxvd2x 33,6,3
|
||||
lxvd2x 32,6,5
|
||||
lxvd2x 7,9,30
|
||||
addi 6,6,128
|
||||
lxvd2x 8,9,31
|
||||
lxvd2x 9,9,12
|
||||
xxpermdi 40,40,40,2
|
||||
xxpermdi 6,6,6,2
|
||||
lxvd2x 10,9,0
|
||||
lxvd2x 11,9,11
|
||||
xxpermdi 41,41,41,2
|
||||
xxpermdi 42,42,42,2
|
||||
lxvd2x 12,9,3
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 43,43,43,2
|
||||
xxpermdi 44,44,44,2
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 32,32,32,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
xxpermdi 10,10,10,2
|
||||
xxpermdi 11,11,11,2
|
||||
xxpermdi 12,12,12,2
|
||||
xxpermdi 0,0,0,2
|
||||
#ifndef CONJ
|
||||
xvmaddasp 6,5,40
|
||||
xvmaddasp 7,5,41
|
||||
xvmaddasp 8,5,42
|
||||
xvmaddasp 9,5,43
|
||||
xvmaddasp 10,5,44
|
||||
xvmaddasp 11,5,45
|
||||
xvmaddasp 12,5,33
|
||||
xvmaddasp 0,5,32
|
||||
vperm 8,8,8,6
|
||||
vperm 9,9,9,6
|
||||
vperm 10,10,10,6
|
||||
vperm 11,11,11,6
|
||||
vperm 12,12,12,6
|
||||
vperm 13,13,13,6
|
||||
vperm 1,1,1,6
|
||||
vperm 0,0,0,6
|
||||
#endif
|
||||
xvmaddasp 6,39,40
|
||||
xvmaddasp 7,39,41
|
||||
xvmaddasp 8,39,42
|
||||
xvmaddasp 9,39,43
|
||||
xvmaddasp 10,39,44
|
||||
xvmaddasp 11,39,45
|
||||
xvmaddasp 12,39,33
|
||||
xvmaddasp 0,39,32
|
||||
#ifdef CONJ
|
||||
vperm 8,8,8,6
|
||||
vperm 9,9,9,6
|
||||
vperm 10,10,10,6
|
||||
vperm 11,11,11,6
|
||||
vperm 12,12,12,6
|
||||
vperm 13,13,13,6
|
||||
vperm 1,1,1,6
|
||||
vperm 0,0,0,6
|
||||
xvmaddasp 6,5,40
|
||||
xvmaddasp 7,5,41
|
||||
xvmaddasp 8,5,42
|
||||
xvmaddasp 9,5,43
|
||||
xvmaddasp 10,5,44
|
||||
xvmaddasp 11,5,45
|
||||
xvmaddasp 12,5,33
|
||||
xvmaddasp 0,5,32
|
||||
#endif
|
||||
xxpermdi 6,6,6,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
stxvd2x 6,0,9
|
||||
xxpermdi 10,10,10,2
|
||||
stxvd2x 7,9,30
|
||||
xxpermdi 11,11,11,2
|
||||
stxvd2x 8,9,31
|
||||
xxpermdi 12,12,12,2
|
||||
stxvd2x 9,9,12
|
||||
xxpermdi 0,0,0,2
|
||||
stxvd2x 10,9,0
|
||||
stxvd2x 11,9,11
|
||||
stxvd2x 12,9,3
|
||||
stxvd2x 0,9,5
|
||||
addi 9,9,128
|
||||
bgt 7,.L6
|
||||
ld 29,-24(1)
|
||||
ld 30,-16(1)
|
||||
.L5:
|
||||
cmpd 7,7,4
|
||||
ble 7,.L36
|
||||
sldi 11,4,1
|
||||
ld 28,-32(1)
|
||||
b .L4
|
||||
.L36:
|
||||
ld 28,-32(1)
|
||||
ld 31,-8(1)
|
||||
b .L33
|
||||
.L44:
|
||||
li 31,1
|
||||
mtctr 31
|
||||
b .L11
|
||||
.L26:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L13
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,4,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 31
|
||||
.byte 30
|
||||
.byte 29
|
||||
.byte 28
|
||||
.byte 23
|
||||
.byte 22
|
||||
.byte 21
|
||||
.byte 20
|
||||
.byte 15
|
||||
.byte 14
|
||||
.byte 13
|
||||
.byte 12
|
||||
.byte 7
|
||||
.byte 6
|
||||
.byte 5
|
||||
.byte 4
|
||||
.LC3:
|
||||
.byte 27
|
||||
.byte 26
|
||||
.byte 25
|
||||
.byte 24
|
||||
.byte 19
|
||||
.byte 18
|
||||
.byte 17
|
||||
.byte 16
|
||||
.byte 11
|
||||
.byte 10
|
||||
.byte 9
|
||||
.byte 8
|
||||
.byte 3
|
||||
.byte 2
|
||||
.byte 1
|
||||
.byte 0
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.gnu_attribute 4, 1
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
538
kernel/power/caxpy_power9.S
Normal file
538
kernel/power/caxpy_power9.S
Normal file
@@ -0,0 +1,538 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
|
||||
.file "caxpy.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl caxpy_k
|
||||
.type caxpy_k, @function
|
||||
*/
|
||||
|
||||
/*
 * caxpy_k: entry, general strided loop and the common return.
 * Register roles below are read off the code in this file; the mapping to
 * named arguments is an assumption -- TODO confirm against the C source:
 *   r3 = element count, r8 = source pointer, r9 = source step,
 *   r10 = destination pointer, 96(r1) = destination step,
 *   f1 / f2 = the two scalar multipliers.
 * Every path returns 0 in r3 through .L33.
 */
PROLOGUE
|
||||
|
||||
caxpy_k:
|
||||
.LCF0:
|
||||
/* Global entry point: rebuild the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
/* r7 = r3; a count <= 0 returns immediately. */
mr. 7,3
|
||||
ble 0,.L33
|
||||
/* r9 == 1 is examined further at .L37. */
cmpdi 7,9,1
|
||||
beq 7,.L37
|
||||
.L3:
|
||||
/* CTR = count. r9 and the doubleword at 96(r1) are multiplied by 8 and used
   as the byte steps for r8 and r10. */
mtctr 7
|
||||
ld 7,96(1)
|
||||
sldi 9,9,3
|
||||
sldi 7,7,3
|
||||
.p2align 4,,15
|
||||
/*
 * One 8-byte element (two floats) per pass.
 * With (xr, xi) read from r8 and (yr, yi) from r10:
 *   without CONJ: yr += f1*xr - f2*xi;  yi += f1*xi + f2*xr
 *   with CONJ:    yr += f1*xr + f2*xi;  yi -= f1*xi - f2*xr
 */
.L14:
|
||||
lfs 10,4(8)
|
||||
lfs 11,0(8)
|
||||
lfs 12,0(10)
|
||||
lfs 0,4(10)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,11,1,10
|
||||
#else
|
||||
fmsubs 11,11,1,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,0(10)
|
||||
lfs 11,0(8)
|
||||
lfs 12,4(8)
|
||||
add 8,8,9
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,4(10)
|
||||
add 10,10,7
|
||||
bdnz .L14
|
||||
/* Common exit: return value 0. */
.L33:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/*
 * Reached when r9 == 1. If the doubleword at 96(r1) is not 1 as well, use the
 * strided loop at .L3. Otherwise r4 = r7 with its low four bits cleared (a
 * multiple of 16) and r11 = 0; a non-zero r4 is processed by .L38 first.
 */
.L37:
|
||||
ld 6,96(1)
|
||||
cmpdi 7,6,1
|
||||
bne 7,.L3
|
||||
rldicr. 4,7,0,59
|
||||
li 11,0
|
||||
bne 0,.L38
|
||||
/*
 * r11 is an offset counted in 4-byte floats. This computes
 *   r3 = r8 + 4*r11, r9 = r10 + 4*r11,
 *   r6 = r8 + 4*r11 + 32, r5 = r10 + 4*r11 + 32,
 *   r0 = r7 - r4.
 * The subfc/subfe pairs turn address comparisons into 0/1 flags.
 * NOTE(review): this looks like a compiler-generated check that the two
 * 32-byte windows do not overlap -- confirm. A zero result selects the
 * scalar loop at .L7.
 */
.L4:
|
||||
addi 6,11,8
|
||||
subf 0,4,7
|
||||
sldi 6,6,2
|
||||
addi 9,6,-32
|
||||
add 5,10,6
|
||||
add 6,8,6
|
||||
add 3,8,9
|
||||
add 9,10,9
|
||||
subfc 5,5,3
|
||||
subfe 5,5,5
|
||||
subfc 6,6,9
|
||||
subfe 12,12,12
|
||||
addi 6,5,1
|
||||
addi 5,12,1
|
||||
or 6,6,5
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
/* Second guard, built from r4, r7 and r0; a zero result also goes to .L7. */
sradi 6,4,63
|
||||
srdi 5,7,63
|
||||
subfc 12,7,4
|
||||
adde 6,5,6
|
||||
subfic 12,0,4
|
||||
subfe 12,12,12
|
||||
xori 6,6,0x1
|
||||
neg 12,12
|
||||
and 6,6,12
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
/* r6 = 1, unless r4 < r7, in which case .L39 replaces it with r0. */
cmpd 7,4,7
|
||||
li 6,1
|
||||
blt 7,.L39
|
||||
/*
 * Decide between the scalar tail (.L10) and the vector loop (.L11).
 * First test: when r7 - 1 - r4 is at most 3 (unsigned, via the carry bit of
 * subfic) go straight to .L10.
 */
.L9:
|
||||
addi 0,7,-1
|
||||
subf 0,4,0
|
||||
subfic 0,0,3
|
||||
subfe 12,12,12
|
||||
addi 0,12,1
|
||||
rlwinm 0,0,0,0xff
|
||||
cmpwi 7,0,0
|
||||
bne 7,.L10
|
||||
/* Second test, a carry-based comparison of r4 with r7; non-zero also
   selects .L10. */
sradi 0,4,63
|
||||
subfc 12,7,4
|
||||
adde 5,5,0
|
||||
rlwinm 5,5,0,0xff
|
||||
cmpwi 7,5,0
|
||||
bne 7,.L10
|
||||
/*
 * Vector setup: f1 and f2 are converted to single precision and splatted
 * across vs0 and vs12. r31 is saved at -8(r1) and receives (r6 - 1) >> 2,
 * which becomes CTR; when that is zero .L40 is taken instead.
 * The 16-byte constants .LC3 and .LC2 are loaded into vs41 (v9) and
 * vs42 (v10); r5 = 16 and r6 = 0 are the byte offsets used by the loop.
 */
xscvdpspn 0,1
|
||||
xscvdpspn 12,2
|
||||
addi 0,6,-1
|
||||
std 31,-8(1)
|
||||
addis 12,2,.LC2@toc@ha
|
||||
addis 6,2,.LC3@toc@ha
|
||||
li 5,16
|
||||
srdi. 31,0,2
|
||||
addi 6,6,.LC3@toc@l
|
||||
addi 12,12,.LC2@toc@l
|
||||
mtctr 31
|
||||
lxv 41,0(6)
|
||||
lxv 42,0(12)
|
||||
li 6,0
|
||||
xxspltw 0,0,0
|
||||
xxspltw 12,12,0
|
||||
beq 0,.L40
|
||||
.p2align 4,,15
|
||||
/*
 * Vector loop, 32 bytes of each operand per pass, CTR passes in total.
 * Two 16-byte vectors are loaded from r3 and two from r9 (offsets r6 and r5),
 * rearranged with vperm using the controls in v9 / v10, combined with the
 * multipliers held in vs0 / vs12, merged back with vmrglw / vmrghw and
 * stored through r9 at the same offsets. r5 and r6 advance by 32 per pass.
 */
.L11:
|
||||
#ifdef CONJ
|
||||
lxvx 33,3,5
|
||||
lxvx 44,3,6
|
||||
lxvx 43,9,6
|
||||
lxvx 32,9,5
|
||||
vperm 13,1,12,10
|
||||
vperm 12,1,12,9
|
||||
vperm 8,0,11,10
|
||||
vperm 0,0,11,9
|
||||
xvmulsp 33,12,44
|
||||
xvmulsp 11,12,45
|
||||
xvmaddasp 33,0,45
|
||||
xvmsubmsp 44,0,11
|
||||
xvaddsp 33,33,40
|
||||
xvsubsp 32,32,44
|
||||
#else
|
||||
lxvx 33,3,6
|
||||
lxvx 32,3,5
|
||||
lxvx 43,9,6
|
||||
lxvx 44,9,5
|
||||
vperm 13,0,1,10
|
||||
vperm 0,0,1,9
|
||||
vperm 8,12,11,10
|
||||
vperm 12,12,11,9
|
||||
xvmulsp 33,12,32
|
||||
xvmulsp 11,12,45
|
||||
xvmsubasp 33,0,45
|
||||
xvmaddmsp 32,0,11
|
||||
xvaddsp 33,33,40
|
||||
xvaddsp 32,32,44
|
||||
#endif
|
||||
vmrglw 13,0,1
|
||||
vmrghw 0,0,1
|
||||
stxvx 45,9,6
|
||||
stxvx 32,9,5
|
||||
addi 6,6,32
|
||||
addi 5,5,32
|
||||
bdnz .L11
|
||||
/* After the loop: r0 is rounded down to a multiple of 4, r31 is restored,
   and the progress counters advance: r4 += r0, r11 += 2*r0. */
rldicr 0,0,0,61
|
||||
ld 31,-8(1)
|
||||
sldi 9,0,1
|
||||
add 4,4,0
|
||||
add 11,11,9
|
||||
/*
 * Scalar tail, unrolled for at most four elements.
 * r11 is the float offset of the next element and r4 its element index;
 * byte offsets derived from r11 index r8 (loads) and r10 (loads and stores).
 * After each element, "ble 7,.L33" returns when r7 is not greater than the
 * index of the following element. The arithmetic matches the strided loop:
 *   without CONJ: yr += f1*xr - f2*xi;  yi += f1*xi + f2*xr
 *   with CONJ:    yr += f1*xr + f2*xi;  yi -= f1*xi - f2*xr
 */
.L10:
|
||||
sldi 5,11,2
|
||||
addi 6,4,1
|
||||
addi 9,11,2
|
||||
addi 3,5,4
|
||||
lfsx 12,8,5
|
||||
cmpd 7,7,6
|
||||
lfsx 0,10,5
|
||||
lfsx 11,8,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,12,1,11
|
||||
#else
|
||||
fmsubs 12,12,1,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,5
|
||||
lfsx 11,8,5
|
||||
lfsx 12,8,3
|
||||
lfsx 0,10,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,3
|
||||
ble 7,.L33
|
||||
/* Second element: float offset r11 + 2. */
sldi 9,9,2
|
||||
addi 5,4,2
|
||||
addi 6,11,4
|
||||
addi 3,9,4
|
||||
lfsx 12,8,9
|
||||
cmpd 7,7,5
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,3
|
||||
lfsx 0,10,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,3
|
||||
ble 7,.L33
|
||||
/* Third element: float offset r11 + 4. */
sldi 6,6,2
|
||||
addi 4,4,3
|
||||
addi 9,11,6
|
||||
addi 5,6,4
|
||||
lfsx 12,8,6
|
||||
cmpd 7,7,4
|
||||
lfsx 0,10,6
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L33
|
||||
/* Fourth and last element: float offset r11 + 6. Nothing reads f1 / f2
   afterwards, so they are reused as scratch for the final result. */
sldi 9,9,2
|
||||
addi 7,9,4
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,7
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,7
|
||||
lfsx 0,10,7
|
||||
fmuls 2,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 1,1,12,2
|
||||
fsubs 1,0,1
|
||||
#else
|
||||
fmadds 1,1,12,2
|
||||
fadds 1,0,1
|
||||
#endif
|
||||
stfsx 1,10,7
|
||||
b .L33
|
||||
/* Out-of-line arm of the "blt 7,.L39" test: r6 = r0, then continue at .L9. */
.L39:
|
||||
mr 6,0
|
||||
b .L9
|
||||
/*
 * Setup for the 16-element main loop at .L6 (taken when r4 != 0).
 * Builds two single-precision multiplier vectors from f1 / f2: one value is
 * splatted with xxspltw, the other is paired with its negation (fneg) and
 * the two are interleaved by vmrgew into v0 (vs32). Which of f1 / f2 takes
 * which role depends on CONJ.
 * The 16-byte table at .LANCHOR0 is loaded into vs33 (v1), and
 * r3 = r4 >> 1 becomes the loop bound.
 */
.L38:
|
||||
#ifdef CONJ
|
||||
fneg 0,1
|
||||
xxpermdi 45,1,1,0
|
||||
xscvdpspn 12,2
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,4,1
|
||||
xxpermdi 44,0,0,0
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xvcvdpsp 45,45
|
||||
lxv 33,0(9)
|
||||
xvcvdpsp 32,44
|
||||
xxspltw 12,12,0
|
||||
#else
|
||||
fneg 12,2
|
||||
xxpermdi 32,2,2,0
|
||||
xscvdpspn 0,1
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,4,1
|
||||
xxpermdi 45,12,12,0
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xvcvdpsp 32,32
|
||||
lxv 33,0(9)
|
||||
xvcvdpsp 45,45
|
||||
xxspltw 0,0,0
|
||||
#endif
|
||||
vmrgew 0,0,13
|
||||
/* CR0 still holds the result of "sradi."; a zero bound skips the loop. */
beq 0,.L5
|
||||
/* r6 / r9 are running copies of r8 / r10; r5 is the loop counter. */
mr 6,8
|
||||
mr 9,10
|
||||
li 5,0
|
||||
.p2align 4,,15
|
||||
/*
 * Main loop: 128 bytes (eight 16-byte vectors) are read through r6 and
 * through r9 on every pass, and eight results are stored back through r9.
 * Both pointers are advanced by 128 early, so later accesses use negative
 * displacements. For each source vector, vpermr with the control in v1 makes
 * a rearranged copy, and two fused multiply-adds apply the two multiplier
 * vectors (vs32 plus vs12 under CONJ, vs0 plus vs32 otherwise).
 * r5 advances by 8 per pass; the loop repeats while r3 > r5.
 */
.L6:
|
||||
lxv 38,16(6)
|
||||
lxv 11,16(9)
|
||||
addi 5,5,8
|
||||
addi 6,6,128
|
||||
addi 9,9,128
|
||||
lxv 39,-96(6)
|
||||
lxv 40,-80(6)
|
||||
lxv 41,-64(6)
|
||||
lxv 42,-48(6)
|
||||
cmpd 7,3,5
|
||||
lxv 43,-32(6)
|
||||
lxv 45,-128(6)
|
||||
lxv 44,-16(6)
|
||||
#ifdef CONJ
|
||||
lxv 0,-128(9)
|
||||
vpermr 17,6,6,1
|
||||
xvmaddmsp 38,32,11
|
||||
lxv 11,-96(9)
|
||||
vpermr 18,7,7,1
|
||||
vpermr 19,8,8,1
|
||||
vpermr 2,9,9,1
|
||||
vpermr 3,10,10,1
|
||||
vpermr 4,11,11,1
|
||||
xvmaddasp 0,32,45
|
||||
vpermr 5,12,12,1
|
||||
xvmaddmsp 39,32,11
|
||||
lxv 11,-80(9)
|
||||
vpermr 13,13,13,1
|
||||
xvmaddasp 38,12,49
|
||||
xvmaddmsp 40,32,11
|
||||
lxv 11,-64(9)
|
||||
xvmaddmsp 45,12,0
|
||||
xvmaddasp 39,12,50
|
||||
stxv 38,-112(9)
|
||||
xvmaddmsp 41,32,11
|
||||
lxv 11,-48(9)
|
||||
xvmaddasp 40,12,51
|
||||
stxv 45,-128(9)
|
||||
stxv 39,-96(9)
|
||||
xvmaddmsp 42,32,11
|
||||
lxv 11,-32(9)
|
||||
xvmaddasp 41,12,34
|
||||
stxv 40,-80(9)
|
||||
xvmaddmsp 43,32,11
|
||||
lxv 11,-16(9)
|
||||
xvmaddasp 42,12,35
|
||||
stxv 41,-64(9)
|
||||
xvmaddmsp 44,32,11
|
||||
xvmaddasp 43,12,36
|
||||
stxv 42,-48(9)
|
||||
xvmaddasp 44,12,37
|
||||
#else
|
||||
lxv 12,-128(9)
|
||||
vpermr 17,6,6,1
|
||||
xvmaddmsp 38,0,11
|
||||
lxv 11,-96(9)
|
||||
vpermr 18,7,7,1
|
||||
vpermr 19,8,8,1
|
||||
vpermr 2,9,9,1
|
||||
vpermr 3,10,10,1
|
||||
vpermr 4,11,11,1
|
||||
xvmaddasp 12,0,45
|
||||
vpermr 5,12,12,1
|
||||
xvmaddmsp 39,0,11
|
||||
lxv 11,-80(9)
|
||||
vpermr 13,13,13,1
|
||||
xvmaddasp 38,32,49
|
||||
xvmaddmsp 40,0,11
|
||||
lxv 11,-64(9)
|
||||
xvmaddmsp 45,32,12
|
||||
xvmaddasp 39,32,50
|
||||
stxv 38,-112(9)
|
||||
xvmaddmsp 41,0,11
|
||||
lxv 11,-48(9)
|
||||
xvmaddasp 40,32,51
|
||||
stxv 45,-128(9)
|
||||
stxv 39,-96(9)
|
||||
xvmaddmsp 42,0,11
|
||||
lxv 11,-32(9)
|
||||
xvmaddasp 41,32,34
|
||||
stxv 40,-80(9)
|
||||
xvmaddmsp 43,0,11
|
||||
lxv 11,-16(9)
|
||||
xvmaddasp 42,32,35
|
||||
stxv 41,-64(9)
|
||||
xvmaddmsp 44,0,11
|
||||
xvmaddasp 43,32,36
|
||||
stxv 42,-48(9)
|
||||
xvmaddasp 44,32,37
|
||||
#endif
|
||||
stxv 43,-32(9)
|
||||
stxv 44,-16(9)
|
||||
bgt 7,.L6
|
||||
/* After the main loop: return when r7 <= r4; otherwise r11 = 2*r4 (the float
   offset of the first remaining element) and the remainder goes to .L4. */
.L5:
|
||||
cmpd 7,7,4
|
||||
ble 7,.L33
|
||||
sldi 11,4,1
|
||||
b .L4
|
||||
/*
 * Scalar fallback for the remaining elements, using the pointers left in
 * r3 (loads only) and r9 (loads and stores). CTR = r7 - r4.
 * If r4 + 1 > r7, or r7 equals the value built by li -1 / rldicr
 * (0x8000000000000000), .L26 forces CTR to 1 instead.
 */
.L7:
|
||||
addi 10,4,1
|
||||
subf 8,4,7
|
||||
cmpd 7,10,7
|
||||
mtctr 8
|
||||
bgt 7,.L26
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,7,10
|
||||
beq 7,.L26
|
||||
.p2align 4,,15
|
||||
/*
 * One element per pass; both pointers are post-incremented by 8, so the
 * second half of the body uses -8 / -4 displacements.
 *   without CONJ: yr += f1*xr - f2*xi;  yi += f1*xi + f2*xr
 *   with CONJ:    yr += f1*xr + f2*xi;  yi -= f1*xi - f2*xr
 */
.L13:
|
||||
lfs 10,4(3)
|
||||
lfs 11,0(3)
|
||||
lfs 12,0(9)
|
||||
lfs 0,4(9)
|
||||
addi 3,3,8
|
||||
addi 9,9,8
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,1,11,10
|
||||
#else
|
||||
fmsubs 11,1,11,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,-8(9)
|
||||
lfs 11,-8(3)
|
||||
lfs 12,-4(3)
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,-4(9)
|
||||
bdnz .L13
|
||||
b .L33
|
||||
/* The computed trip count for .L11 was zero: run that loop exactly once. */
.L40:
|
||||
li 31,1
|
||||
mtctr 31
|
||||
b .L11
|
||||
/* Force a single pass of the scalar loop .L13. */
.L26:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L13
|
||||
/* NOTE(review): the .long / .byte pair below is presumably the
   compiler-emitted traceback table -- confirm. */
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
.size caxpy_k,.-caxpy_k
|
||||
/*
 * Read-only constants for caxpy_k.
 * swap_mask_arr (also reachable as .LANCHOR0) is a 16-byte table; .L38 loads
 * it into vs33 and .L6 uses it as the vpermr control. Its byte indices
 * exchange the two 4-byte halves of each 8-byte group.
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
/* .LC2 and .LC3 are the 16-byte vperm controls loaded into vs42 and vs41
   before the .L11 loop. */
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 31
|
||||
.byte 30
|
||||
.byte 29
|
||||
.byte 28
|
||||
.byte 23
|
||||
.byte 22
|
||||
.byte 21
|
||||
.byte 20
|
||||
.byte 15
|
||||
.byte 14
|
||||
.byte 13
|
||||
.byte 12
|
||||
.byte 7
|
||||
.byte 6
|
||||
.byte 5
|
||||
.byte 4
|
||||
.LC3:
|
||||
.byte 27
|
||||
.byte 26
|
||||
.byte 25
|
||||
.byte 24
|
||||
.byte 19
|
||||
.byte 18
|
||||
.byte 17
|
||||
.byte 16
|
||||
.byte 11
|
||||
.byte 10
|
||||
.byte 9
|
||||
.byte 8
|
||||
.byte 3
|
||||
.byte 2
|
||||
.byte 1
|
||||
.byte 0
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.gnu_attribute 4, 1
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
248
kernel/power/cdot_power9.S
Normal file
248
kernel/power/cdot_power9.S
Normal file
@@ -0,0 +1,248 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/*
|
||||
.file "cdot.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl cdot_k
|
||||
.type cdot_k, @function
|
||||
*/
|
||||
/*
 * cdot_k: entry, general strided loop and the final combine.
 * Register roles are read off the code; the mapping to named arguments is an
 * assumption -- TODO confirm against the C source:
 *   r3 = element count, r4 / r5 = first pointer and step,
 *   r6 / r7 = second pointer and step.
 * The two result values are left in f1 and f2.
 */
PROLOGUE
|
||||
|
||||
cdot_k:
|
||||
.LCF0:
|
||||
/* Global entry point: rebuild the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry cdot_k,.-cdot_k
|
||||
/* r9 = r3; a count <= 0 returns zeros via .L10. r5 == 1 is examined at .L18. */
mr. 9,3
|
||||
ble 0,.L10
|
||||
cmpdi 7,5,1
|
||||
beq 7,.L18
|
||||
.L3:
|
||||
/* CTR = count; r5 and r7 are multiplied by 8 to become byte steps; the four
   scalar accumulators (f1, f2, f8 and f12 or f9) are cleared. */
mtctr 9
|
||||
xxlxor 2,2,2
|
||||
sldi 5,5,3
|
||||
sldi 7,7,3
|
||||
#ifdef CONJ
|
||||
fmr 12,2
|
||||
#endif
|
||||
fmr 8,2
|
||||
#ifndef CONJ
|
||||
fmr 9,2
|
||||
#endif
|
||||
fmr 1,2
|
||||
.p2align 4,,15
|
||||
/*
 * One element per pass. With (ar, ai) read from r4 and (br, bi) from r6:
 *   without CONJ: f1 += ar*br, f8 += ar*bi, f9 += ai*bi, f2 += br*ai
 *   with CONJ:    f1 += ar*br, f12 += ar*bi, f8 += ai*bi, f2 += br*ai
 */
.L9:
|
||||
#ifdef CONJ
|
||||
lfs 9,0(4)
|
||||
lfs 11,0(6)
|
||||
lfs 10,4(6)
|
||||
lfs 0,4(4)
|
||||
add 6,6,7
|
||||
add 4,4,5
|
||||
fmadds 1,9,11,1
|
||||
fmadds 12,9,10,12
|
||||
fmadds 8,0,10,8
|
||||
fmadds 2,11,0,2
|
||||
#else
|
||||
lfs 10,0(4)
|
||||
lfs 12,0(6)
|
||||
lfs 11,4(6)
|
||||
lfs 0,4(4)
|
||||
add 6,6,7
|
||||
add 4,4,5
|
||||
fmadds 1,10,12,1
|
||||
fmadds 8,10,11,8
|
||||
fmadds 9,0,11,9
|
||||
fmadds 2,12,0,2
|
||||
#endif
|
||||
bdnz .L9
|
||||
/*
 * Final combine of the partial sums:
 *   without CONJ: f2 = f2 + f8,  f1 = f1 - f9
 *   with CONJ:    f2 = f12 - f2, f1 = f1 + f8
 */
.L7:
|
||||
#ifdef CONJ
|
||||
fsubs 2,12,2
|
||||
fadds 1,1,8
|
||||
#else
|
||||
fadds 2,2,8
|
||||
fsubs 1,1,9
|
||||
#endif
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/*
 * Reached when r5 == 1. If r7 is not 1 as well, use the strided loop at .L3.
 * Otherwise r10 = r9 with its low three bits cleared (a multiple of 8);
 * a non-zero r10 is processed by the vector path at .L19 first. With
 * r10 == 0 the scalar accumulators are cleared and r8 = 0.
 */
.L18:
|
||||
cmpdi 7,7,1
|
||||
bne 7,.L3
|
||||
rldicr. 10,9,0,60
|
||||
bne 0,.L19
|
||||
xxlxor 2,2,2
|
||||
li 8,0
|
||||
#ifdef CONJ
|
||||
fmr 12,2
|
||||
#endif
|
||||
fmr 8,2
|
||||
#ifndef CONJ
|
||||
fmr 9,2
|
||||
#endif
|
||||
fmr 1,2
|
||||
/*
 * Scalar loop setup for the elements from index r10 up to r9.
 * r8 is a float offset; r8*4 is added to both pointers. CTR = r9 - r10.
 * If r10 + 1 > r9, or r9 equals the value built by li -1 / rldicr
 * (0x8000000000000000), .L16 forces CTR to 1 instead.
 */
.L4:
|
||||
addi 7,10,1
|
||||
sldi 8,8,2
|
||||
subf 10,10,9
|
||||
cmpd 7,7,9
|
||||
mtctr 10
|
||||
add 4,4,8
|
||||
add 6,6,8
|
||||
bgt 7,.L16
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L16
|
||||
.p2align 4,,15
|
||||
/* Same accumulation as the strided loop, with a fixed 8-byte step on both
   pointers; finishes through the combine at .L7. */
.L8:
|
||||
#ifdef CONJ
|
||||
lfs 9,0(4)
|
||||
lfs 11,0(6)
|
||||
lfs 10,4(6)
|
||||
lfs 0,4(4)
|
||||
addi 6,6,8
|
||||
addi 4,4,8
|
||||
fmadds 1,9,11,1
|
||||
fmadds 12,9,10,12
|
||||
fmadds 8,0,10,8
|
||||
fmadds 2,11,0,2
|
||||
#else
|
||||
lfs 10,0(4)
|
||||
lfs 12,0(6)
|
||||
lfs 11,4(6)
|
||||
lfs 0,4(4)
|
||||
addi 6,6,8
|
||||
addi 4,4,8
|
||||
fmadds 1,10,12,1
|
||||
fmadds 8,10,11,8
|
||||
fmadds 9,0,11,9
|
||||
fmadds 2,12,0,2
|
||||
#endif
|
||||
bdnz .L8
|
||||
b .L7
|
||||
.p2align 4,,15
|
||||
/* Count <= 0: return with f1 = f2 = 0. */
.L10:
|
||||
xxlxor 1,1,1
|
||||
fmr 2,1
|
||||
blr
|
||||
/*
 * Vector path for the first r10 elements (r10 is a non-zero multiple of 8).
 * The 16-byte table at .LANCHOR0 is loaded into vs32 (v0) as the vpermr
 * control, vs42 is zeroed, and r3 = r10 >> 1 is the loop bound; a zero bound
 * goes to .L12. Seven more accumulators (vs6, vs4, vs0, vs7, vs5, vs3, vs12)
 * are cleared by copying vs42. r7 / r8 are running copies of r4 / r6.
 */
.L19:
|
||||
addis 8,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,10,1
|
||||
xxspltib 42,0
|
||||
addi 8,8,.LANCHOR0@toc@l
|
||||
lxv 32,0(8)
|
||||
beq 0,.L12
|
||||
xxlor 6,42,42
|
||||
xxlor 4,42,42
|
||||
xxlor 0,42,42
|
||||
xxlor 7,42,42
|
||||
xxlor 5,42,42
|
||||
xxlor 3,42,42
|
||||
xxlor 12,42,42
|
||||
mr 7,4
|
||||
mr 8,6
|
||||
li 5,0
|
||||
.p2align 4,,15
|
||||
/*
 * 64 bytes (four vectors) from each pointer per pass; r5 advances by 4 and
 * the loop repeats while r3 > r5.
 * vs12, vs3, vs5, vs7 accumulate products of the vectors as loaded;
 * vs0, vs4, vs6, vs42 accumulate products of the r7 vectors with the
 * vpermr-rearranged copies of the r8 vectors.
 */
.L6:
|
||||
lxv 43,0(8)
|
||||
lxv 44,16(8)
|
||||
addi 5,5,4
|
||||
addi 8,8,64
|
||||
addi 7,7,64
|
||||
lxv 45,-32(8)
|
||||
lxv 33,-16(8)
|
||||
lxv 8,-64(7)
|
||||
lxv 9,-48(7)
|
||||
cmpd 7,3,5
|
||||
lxv 10,-32(7)
|
||||
lxv 11,-16(7)
|
||||
vpermr 6,11,11,0
|
||||
vpermr 7,12,12,0
|
||||
vpermr 8,13,13,0
|
||||
vpermr 9,1,1,0
|
||||
xvmaddasp 12,43,8
|
||||
xvmaddasp 3,44,9
|
||||
xvmaddasp 0,8,38
|
||||
xvmaddasp 4,9,39
|
||||
xvmaddasp 6,10,40
|
||||
xvmaddasp 5,45,10
|
||||
xvmaddasp 42,11,41
|
||||
xvmaddasp 7,33,11
|
||||
bgt 7,.L6
|
||||
/* Fold the eight accumulators into two: vs12 (as-loaded products) and
   vs42 (products with the rearranged copies). */
xvaddsp 12,12,3
|
||||
xvaddsp 0,0,4
|
||||
xvaddsp 12,12,5
|
||||
xvaddsp 0,0,6
|
||||
xvaddsp 12,12,7
|
||||
xvaddsp 42,0,42
|
||||
/*
 * Horizontal reduction of the two vector accumulators vs12 and vs42.
 * Each is added to its doubleword-swapped copy, individual words are rotated
 * into position with xxsldwi and converted to double with xscvspdp, leaving
 * the partial sums in the scalar registers used by .L8 / .L7
 * (f1, f2, f8 and f12 with CONJ, f9 without).
 * r8 = 2*r10 is the float offset of the first unprocessed element;
 * if r9 > r10 the remainder is handled at .L4, otherwise finish at .L7.
 */
.L5:
|
||||
#ifdef CONJ
|
||||
xxpermdi 8,12,12,2
|
||||
xxpermdi 0,42,42,2
|
||||
cmpd 7,9,10
|
||||
sldi 8,10,1
|
||||
xvaddsp 8,8,12
|
||||
xvaddsp 0,0,42
|
||||
xxsldwi 1,8,8,3
|
||||
xxsldwi 12,0,0,3
|
||||
xxsldwi 8,8,8,2
|
||||
xxsldwi 0,0,0,2
|
||||
xscvspdp 1,1
|
||||
xscvspdp 12,12
|
||||
xscvspdp 8,8
|
||||
#else
|
||||
xxpermdi 9,12,12,2
|
||||
xxpermdi 0,42,42,2
|
||||
cmpd 7,9,10
|
||||
sldi 8,10,1
|
||||
xvaddsp 9,9,12
|
||||
xvaddsp 0,0,42
|
||||
xxsldwi 1,9,9,3
|
||||
xxsldwi 2,0,0,3
|
||||
xxsldwi 9,9,9,2
|
||||
xxsldwi 0,0,0,2
|
||||
xscvspdp 8,2
|
||||
xscvspdp 1,1
|
||||
xscvspdp 9,9
|
||||
#endif
|
||||
xscvspdp 2,0
|
||||
bgt 7,.L4
|
||||
b .L7
|
||||
/* Vector loop skipped: vs12 = vs42 (zero), then reduce as usual. */
.L12:
|
||||
xxlor 12,42,42
|
||||
b .L5
|
||||
/* Force a single pass of the scalar loop .L8. */
.L16:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L8
|
||||
/* NOTE(review): the .long / .byte pair below is presumably the
   compiler-emitted traceback table -- confirm. */
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size cdot_k,.-cdot_k
|
||||
/*
 * Read-only constant for cdot_k: swap_mask_arr (also reachable as .LANCHOR0)
 * is the 16-byte table loaded into vs32 at .L19 and used as the vpermr
 * control in .L6. Its byte indices exchange the two 4-byte halves of each
 * 8-byte group.
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
458
kernel/power/icamax_power8.S
Normal file
458
kernel/power/icamax_power8.S
Normal file
@@ -0,0 +1,458 @@
|
||||
/* .file "icamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamax_k
|
||||
.type icamax_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
 * icamax_k: entry, general strided loop and the simple returns.
 * Register roles are read off the code; the mapping to named arguments is an
 * assumption -- TODO confirm against the C source:
 *   r3 = element count, r4 = data pointer, r5 = step in elements.
 * Returns in r3 the 1-based position of the element with the largest
 * |first float| + |second float|, or 0 when r3 <= 0 or r5 <= 0.
 */
PROLOGUE
|
||||
|
||||
icamax_k:
|
||||
.LCF0:
|
||||
/* Global entry point: rebuild the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry icamax_k,.-icamax_k
|
||||
/* r9 = r3; count <= 0 returns 0 via .L25, and r5 <= 0 returns 0 directly. */
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
/* r5 == 1 uses the unit-stride code at .L54. */
cmpdi 7,5,1
|
||||
beq 7,.L54
|
||||
/* f11 = |x[0]| + |x[1]| of the first element is the initial maximum;
   a count of exactly 1 returns 1 via .L29. */
lfs 11,0(4)
|
||||
lfs 0,4(4)
|
||||
cmpdi 7,9,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0
|
||||
beq 7,.L29
|
||||
/* CTR = count - 1; r5 becomes a byte step (x8); r4 moves to the second
   element; r3 = 0 is the best index so far and r9 = 1 the current index. */
addi 9,9,-1
|
||||
sldi 5,5,3
|
||||
mtctr 9
|
||||
add 4,4,5
|
||||
li 3,0
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
/* Per element: f0 = sum of the two absolute values; only a strictly greater
   value (bng skips otherwise) replaces the maximum f11 and the index r3. */
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L23
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
/* Convert the 0-based index to a 1-based result. */
.L52:
|
||||
addi 3,3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Count <= 0: return 0. */
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/*
 * Unit-stride entry. r8 = r9 with its low five bits cleared (a multiple of
 * 32); a non-zero r8 uses the vector path at .L55. Otherwise (r8 == 0) a
 * scalar loop scans every element: f11 = 0 is the initial maximum,
 * r3 = 0 the initial index, CTR = r9 - r8.
 * If r8 + 1 > r9, or r9 equals the value built by li -1 / rldicr
 * (0x8000000000000000), .L43 forces CTR to 1 instead.
 */
.L54:
|
||||
rldicr. 8,9,0,58
|
||||
bne 0,.L55
|
||||
addi 7,8,1
|
||||
li 10,0
|
||||
xxlxor 11,11,11
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
add 4,4,10
|
||||
subf 10,8,9
|
||||
mtctr 10
|
||||
li 3,0
|
||||
bgt 7,.L43
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
/* Per element: f0 = sum of the two absolute values; a strictly greater value
   replaces f11 and records the current index r8 in r3. */
.L44:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L46
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L52
|
||||
.p2align 4,,15
|
||||
/*
 * Prologue of the vector path.
 *  - r31 is saved at -8(r1) and v24..v31 at -144 .. -32(r1), in 16-byte
 *    steps, using r0 as the offset register for stvx.
 *  - v18 and v19 are zeroed, and v17 is zeroed by copying vs50 into vs49.
 *  - .LC2 / .LC3 are loaded into vs44 / vs45 (v12 / v13), doubleword-swapped
 *    and complemented (xxlnand with itself); they are the vperm controls of
 *    the loop.
 *  - .LC4, .LC5, .LC6, .LC7 are loaded into vs63, vs46, vs47, vs48 and
 *    doubleword-swapped; .LC8 is loaded into vs62 (v30) without a swap.
 *  - v29 = 8 + 8 = 16 in every word.
 *  - r10 is the running data pointer (copy of r4) and r6 = 0 the counter.
 */
.L55:
|
||||
li 0,-144
|
||||
std 31,-8(1)
|
||||
addis 5,2,.LC2@toc@ha
|
||||
vspltisw 18,0
|
||||
vspltisw 19,0
|
||||
addis 6,2,.LC3@toc@ha
|
||||
addi 5,5,.LC2@toc@l
|
||||
stvx 24,1,0
|
||||
li 0,-128
|
||||
addi 6,6,.LC3@toc@l
|
||||
xxlor 49,50,50
|
||||
addis 7,2,.LC4@toc@ha
|
||||
lxvd2x 44,0,5
|
||||
addis 10,2,.LC5@toc@ha
|
||||
stvx 25,1,0
|
||||
li 0,-112
|
||||
addi 7,7,.LC4@toc@l
|
||||
lxvd2x 45,0,6
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
stvx 26,1,0
|
||||
li 0,-96
|
||||
addi 10,10,.LC5@toc@l
|
||||
addi 6,6,.LC7@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
stvx 27,1,0
|
||||
li 0,-80
|
||||
lxvd2x 46,0,10
|
||||
xxpermdi 44,44,44,2
|
||||
mr 10,4
|
||||
lxvd2x 48,0,6
|
||||
lxvd2x 47,0,5
|
||||
xxpermdi 45,45,45,2
|
||||
li 6,0
|
||||
stvx 28,1,0
|
||||
li 0,-64
|
||||
xxlnand 44,44,44
|
||||
xxlnand 45,45,45
|
||||
stvx 29,1,0
|
||||
li 0,-48
|
||||
vspltisw 29,8
|
||||
vadduwm 29,29,29
|
||||
xxpermdi 46,46,46,2
|
||||
stvx 30,1,0
|
||||
li 0,-32
|
||||
xxpermdi 47,47,47,2
|
||||
xxpermdi 48,48,48,2
|
||||
stvx 31,1,0
|
||||
lxvd2x 63,0,7
|
||||
addis 7,2,.LC8@toc@ha
|
||||
addi 7,7,.LC8@toc@l
|
||||
lxvd2x 62,0,7
|
||||
xxpermdi 63,63,63,2
|
||||
.p2align 4,,15
|
||||
/*
 * Vector loop: 256 bytes (sixteen 16-byte vectors) are loaded from r10 per
 * pass and r10 then advances by 256; r6 advances by 32 and the loop repeats
 * while r8 > r6.
 * Steps per pass:
 *  - lxvd2x + xxpermdi load each vector and swap its doublewords;
 *  - xvabssp takes absolute values;
 *  - vperm with the controls in v12 / v13 regroups words from pairs of
 *    vectors and xvaddsp adds the regrouped halves.
 *    NOTE(review): presumably this separates the two floats of each element
 *    so the sum is |first| + |second| per element -- confirm;
 *  - xvcmpgtsp / xxsel keep, lane by lane, the larger sum together with an
 *    index vector picked from vs63 / vs46 / vs47 / vs48 (plus v29 for the
 *    second group of sixteen);
 *  - v19 (vs51) holds the running per-lane maxima and v18 (vs50) the
 *    matching indices; candidate indices are offset by v17 before being
 *    selected, and v17 grows by v30 each pass.
 */
.L5:
|
||||
addi 3,10,16
|
||||
addi 5,10,32
|
||||
lxvd2x 34,0,10
|
||||
addi 7,10,64
|
||||
addi 31,10,48
|
||||
addi 12,10,80
|
||||
addi 11,10,96
|
||||
lxvd2x 36,0,3
|
||||
lxvd2x 37,0,5
|
||||
addi 3,10,112
|
||||
addi 5,10,128
|
||||
lxvd2x 38,0,7
|
||||
lxvd2x 7,0,31
|
||||
addi 7,10,160
|
||||
addi 31,10,144
|
||||
lxvd2x 33,0,12
|
||||
lxvd2x 39,0,11
|
||||
addi 12,10,176
|
||||
addi 11,10,192
|
||||
lxvd2x 8,0,3
|
||||
lxvd2x 40,0,5
|
||||
xxpermdi 34,34,34,2
|
||||
addi 3,10,208
|
||||
addi 5,10,224
|
||||
lxvd2x 41,0,7
|
||||
lxvd2x 9,0,31
|
||||
addi 7,10,240
|
||||
lxvd2x 10,0,12
|
||||
lxvd2x 42,0,11
|
||||
xxpermdi 37,37,37,2
|
||||
xxpermdi 36,36,36,2
|
||||
addi 6,6,32
|
||||
lxvd2x 32,0,3
|
||||
lxvd2x 43,0,5
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 38,38,38,2
|
||||
cmpd 7,8,6
|
||||
addi 10,10,256
|
||||
lxvd2x 11,0,7
|
||||
xxpermdi 39,39,39,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 40,40,40,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 41,41,41,2
|
||||
xxpermdi 9,9,9,2
|
||||
xxpermdi 10,10,10,2
|
||||
xxpermdi 42,42,42,2
|
||||
xxpermdi 43,43,43,2
|
||||
xxpermdi 32,32,32,2
|
||||
xxpermdi 11,11,11,2
|
||||
xvabssp 57,37
|
||||
xvabssp 58,39
|
||||
xvabssp 35,40
|
||||
xvabssp 59,41
|
||||
xvabssp 34,34
|
||||
xvabssp 33,33
|
||||
xvabssp 32,32
|
||||
xvabssp 60,43
|
||||
xvabssp 36,36
|
||||
xvabssp 37,7
|
||||
xvabssp 38,38
|
||||
xvabssp 39,8
|
||||
xvabssp 40,9
|
||||
xvabssp 41,10
|
||||
xvabssp 42,42
|
||||
xvabssp 43,11
|
||||
vperm 24,4,2,12
|
||||
vperm 4,4,2,13
|
||||
vperm 2,5,25,12
|
||||
vperm 5,5,25,13
|
||||
vperm 25,1,6,12
|
||||
vperm 6,1,6,13
|
||||
vperm 1,7,26,12
|
||||
vperm 7,7,26,13
|
||||
vperm 26,8,3,12
|
||||
vperm 8,8,3,13
|
||||
vperm 3,9,27,12
|
||||
vperm 9,9,27,13
|
||||
vperm 27,0,10,12
|
||||
vperm 10,0,10,13
|
||||
vperm 0,11,28,12
|
||||
vperm 11,11,28,13
|
||||
xvaddsp 12,33,39
|
||||
xvaddsp 38,57,38
|
||||
xvaddsp 0,32,43
|
||||
xvaddsp 42,59,42
|
||||
xvaddsp 36,56,36
|
||||
xvaddsp 37,34,37
|
||||
xvaddsp 40,58,40
|
||||
xvaddsp 41,35,41
|
||||
xvcmpgtsp 32,12,38
|
||||
xvcmpgtsp 33,0,42
|
||||
xvcmpgtsp 43,37,36
|
||||
xvcmpgtsp 39,41,40
|
||||
xxsel 12,38,12,32
|
||||
xxsel 38,47,48,32
|
||||
xxsel 0,42,0,33
|
||||
xxsel 42,47,48,33
|
||||
xxsel 37,36,37,43
|
||||
xxsel 43,63,46,43
|
||||
xxsel 41,40,41,39
|
||||
xxsel 39,63,46,39
|
||||
xvcmpgtsp 32,12,37
|
||||
xvcmpgtsp 33,0,41
|
||||
xxsel 12,37,12,32
|
||||
xxsel 43,43,38,32
|
||||
xxsel 0,41,0,33
|
||||
xxsel 33,39,42,33
|
||||
xvcmpgtsp 32,0,12
|
||||
vadduwm 1,1,29
|
||||
xxsel 0,12,0,32
|
||||
xxsel 32,43,33,32
|
||||
xvcmpgtsp 33,0,51
|
||||
vadduwm 0,17,0
|
||||
vadduwm 17,17,30
|
||||
xxsel 50,50,32,33
|
||||
xxsel 51,51,0,33
|
||||
bgt 7,.L5
|
||||
/*
 * Reduce the four lanes left by the vector loop to one maximum and index.
 * The lanes of v19 (vs51, maxima) are rotated out with xxsldwi and converted
 * to double in f11, f12, f0 and vs51; the lanes of v18 (indices) are moved
 * to r6, r5, r7, r10 and zero-extended into r3, r31, r11, r0.
 * Pairs are then compared; when two values are equal the smaller index is
 * kept (cmplw here, cmpd at .L57). The result is the maximum in f11 and its
 * index in r3.
 */
xxsldwi 11,51,51,3
|
||||
xxsldwi 12,51,51,2
|
||||
vspltw 0,18,3
|
||||
xxsldwi 0,51,51,1
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
mfvsrwz 6,32
|
||||
vspltw 0,18,2
|
||||
xscvspdp 0,0
|
||||
mfvsrwz 7,50
|
||||
mfvsrwz 5,32
|
||||
vspltw 0,18,0
|
||||
xscvspdp 51,51
|
||||
mfvsrwz 10,32
|
||||
/* First pair: f11 (index r3) against f12 (index r31). */
fcmpu 7,11,12
|
||||
rldicl 3,6,0,32
|
||||
fmr 10,0
|
||||
rldicl 11,7,0,32
|
||||
rldicl 31,5,0,32
|
||||
rldicl 0,10,0,32
|
||||
beq 7,.L56
|
||||
bnl 7,.L8
|
||||
fmr 11,12
|
||||
mr 3,31
|
||||
/* Second pair: f0 (index r7 / r11) against vs51 (index r10 / r0);
   the winner ends up in f10 with its index in r11. */
.L8:
|
||||
xscmpudp 7,0,51
|
||||
bne 7,.L11
|
||||
cmplw 7,7,10
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
/* Final comparison of the two pair winners: f11 / r3 against f10 / r11. */
.L13:
|
||||
fcmpu 7,11,10
|
||||
beq 7,.L57
|
||||
blt 7,.L58
|
||||
/*
 * Tail after the vector loop. If r9 > r8, the elements from index r8 up to
 * r9 are scanned with the scalar loop: r4 advances by 8*r8 bytes and
 * CTR = r9 - r8. If r8 + 1 > r9, or r9 equals the value built by
 * li -1 / rldicr (0x8000000000000000), .L37 forces CTR to 1 instead.
 */
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
add 4,4,10
|
||||
subf 10,8,9
|
||||
mtctr 10
|
||||
bgt 7,.L37
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
/* Per element: a strictly greater |first| + |second| replaces the maximum
   f11 and records the current index r8 in r3. */
.L21:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L20
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
/* Epilogue of the vector path: restore r31 and v24..v31 from the slots
   written at .L55, convert the index to 1-based, and return. */
.L19:
|
||||
li 0,-144
|
||||
ld 31,-8(1)
|
||||
addi 3,3,1
|
||||
lvx 24,1,0
|
||||
li 0,-128
|
||||
lvx 25,1,0
|
||||
li 0,-112
|
||||
lvx 26,1,0
|
||||
li 0,-96
|
||||
lvx 27,1,0
|
||||
li 0,-80
|
||||
lvx 28,1,0
|
||||
li 0,-64
|
||||
lvx 29,1,0
|
||||
li 0,-48
|
||||
lvx 30,1,0
|
||||
li 0,-32
|
||||
lvx 31,1,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Equal values in the first lane pair: keep the smaller of the indices
   r6 / r5 (unsigned word compare) in r3. */
.L56:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
/* Strided path with a count of exactly 1: the answer is position 1. */
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Unequal values in the second lane pair: when f0 is the smaller one, take
   vs51 as the value (copied into f10) and r0 as its index. */
.L11:
|
||||
bnl 7,.L13
|
||||
xscpsgndp 10,51,51
|
||||
mr 11,0
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
/* Equal pair winners: keep the smaller of the indices r3 / r11. */
.L57:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
.p2align 4,,15
|
||||
/* f11 < f10: the second pair wins; take its value and index. */
.L58:
|
||||
fmr 11,10
|
||||
mr 3,11
|
||||
b .L17
|
||||
/* Force a single pass of the scalar loop .L44. */
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
/* Force a single pass of the scalar loop .L21. */
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
/* NOTE(review): the .long / .byte pair below is presumably the
   compiler-emitted traceback table -- confirm. */
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
.size icamax_k,.-icamax_k
|
||||
/*
 * 16-byte constants for the vector path of icamax_k, all loaded at .L55.
 * .LC2 / .LC3: byte-index tables used, after a doubleword swap and a bitwise
 *              complement, as the vperm controls in v12 / v13. .LC2 lists
 *              bytes 0-3 of every 8-byte group of a 32-byte pair and .LC3
 *              lists bytes 4-7.
 * .LC4-.LC7:   word vectors holding the values 0..15, four per constant.
 * .LC8:        the value 32 in every word; added to v17 once per loop pass.
 */
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.byte 16
|
||||
.byte 17
|
||||
.byte 18
|
||||
.byte 19
|
||||
.byte 24
|
||||
.byte 25
|
||||
.byte 26
|
||||
.byte 27
|
||||
.LC3:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 20
|
||||
.byte 21
|
||||
.byte 22
|
||||
.byte 23
|
||||
.byte 28
|
||||
.byte 29
|
||||
.byte 30
|
||||
.byte 31
|
||||
.LC4:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC5:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC6:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC7:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.LC8:
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
387
kernel/power/icamax_power9.S
Normal file
387
kernel/power/icamax_power9.S
Normal file
@@ -0,0 +1,387 @@
|
||||
.file "icamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamax_k
|
||||
.type icamax_k, @function
|
||||
icamax_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry icamax_k,.-icamax_k
|
||||
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
cmpdi 7,5,1
|
||||
beq 7,.L53
|
||||
lfs 11,0(4)
|
||||
lfs 0,4(4)
|
||||
cmpdi 7,9,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0
|
||||
beq 7,.L29
|
||||
addi 9,9,-1
|
||||
sldi 5,5,3
|
||||
li 3,0
|
||||
mtctr 9
|
||||
add 4,4,5
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L23
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
.L51:
|
||||
addi 3,3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L53:
|
||||
rldicr. 8,9,0,58
|
||||
bne 0,.L54
|
||||
addi 7,8,1
|
||||
li 10,0
|
||||
subf 6,8,9
|
||||
li 3,0
|
||||
xxlxor 11,11,11
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10
|
||||
bgt 7,.L43
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
.L44:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L46
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L51
|
||||
.p2align 4,,15
|
||||
.L54:
|
||||
addis 11,2,.LC2@toc@ha
|
||||
addis 3,2,.LC3@toc@ha
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
xxspltib 47,0
|
||||
addis 7,2,.LC4@toc@ha
|
||||
addis 10,2,.LC5@toc@ha
|
||||
stxv 58,-96(1)
|
||||
stxv 59,-80(1)
|
||||
addi 11,11,.LC2@toc@l
|
||||
addi 3,3,.LC3@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
addi 6,6,.LC7@toc@l
|
||||
stxv 62,-32(1)
|
||||
stxv 63,-16(1)
|
||||
xxspltib 58,16
|
||||
addi 7,7,.LC4@toc@l
|
||||
addi 10,10,.LC5@toc@l
|
||||
xxspltib 59,32
|
||||
lxv 44,0(11)
|
||||
lxv 45,0(3)
|
||||
xxspltib 48,0
|
||||
lxv 62,0(5)
|
||||
xxlor 46,47,47
|
||||
lxv 63,0(6)
|
||||
stxv 60,-64(1)
|
||||
stxv 61,-48(1)
|
||||
lxv 60,0(7)
|
||||
lxv 61,0(10)
|
||||
li 7,0
|
||||
mr 10,4
|
||||
vextsb2w 26,26
|
||||
vextsb2w 27,27
|
||||
stxv 56,-128(1)
|
||||
stxv 57,-112(1)
|
||||
.p2align 4,,15
|
||||
.L5:
|
||||
lxv 0,0(10)
|
||||
addi 7,7,32
|
||||
addi 10,10,256
|
||||
cmpd 7,8,7
|
||||
xvabssp 34,0
|
||||
lxv 0,-240(10)
|
||||
xvabssp 42,0
|
||||
lxv 0,-224(10)
|
||||
xvabssp 49,0
|
||||
lxv 0,-208(10)
|
||||
vpermr 25,10,2,12
|
||||
vpermr 2,10,2,13
|
||||
xvabssp 35,0
|
||||
lxv 0,-192(10)
|
||||
xvaddsp 34,57,34
|
||||
xvabssp 36,0
|
||||
lxv 0,-176(10)
|
||||
vpermr 10,3,17,12
|
||||
vpermr 3,3,17,13
|
||||
xvabssp 33,0
|
||||
lxv 0,-160(10)
|
||||
xvaddsp 10,42,35
|
||||
xvabssp 50,0
|
||||
lxv 0,-144(10)
|
||||
vpermr 17,1,4,12
|
||||
vpermr 4,1,4,13
|
||||
xvabssp 37,0
|
||||
lxv 0,-128(10)
|
||||
xvaddsp 36,49,36
|
||||
xvabssp 38,0
|
||||
lxv 0,-112(10)
|
||||
vpermr 1,5,18,12
|
||||
vpermr 5,5,18,13
|
||||
xvabssp 43,0
|
||||
lxv 0,-96(10)
|
||||
xvaddsp 12,33,37
|
||||
xvabssp 51,0
|
||||
lxv 0,-80(10)
|
||||
vpermr 18,11,6,12
|
||||
vpermr 6,11,6,13
|
||||
xvabssp 39,0
|
||||
lxv 0,-64(10)
|
||||
xvaddsp 38,50,38
|
||||
xvabssp 40,0
|
||||
lxv 0,-48(10)
|
||||
vpermr 11,7,19,12
|
||||
vpermr 7,7,19,13
|
||||
xvabssp 32,0
|
||||
lxv 0,-32(10)
|
||||
xvaddsp 11,43,39
|
||||
xvcmpgtsp 39,10,34
|
||||
xvcmpgtsp 43,12,36
|
||||
xvabssp 56,0
|
||||
lxv 0,-16(10)
|
||||
vpermr 19,0,8,12
|
||||
vpermr 8,0,8,13
|
||||
xxsel 10,34,10,39
|
||||
xxsel 12,36,12,43
|
||||
xxsel 39,60,61,39
|
||||
xxsel 43,62,63,43
|
||||
xvabssp 41,0
|
||||
xvaddsp 40,51,40
|
||||
vpermr 0,9,24,12
|
||||
vpermr 9,9,24,13
|
||||
xvaddsp 0,32,41
|
||||
xvcmpgtsp 41,11,38
|
||||
xvcmpgtsp 32,12,10
|
||||
xvcmpgtsp 42,0,40
|
||||
xxsel 11,38,11,41
|
||||
xxsel 12,10,12,32
|
||||
xxsel 43,39,43,32
|
||||
xxsel 41,60,61,41
|
||||
xxsel 0,40,0,42
|
||||
xxsel 42,62,63,42
|
||||
xvcmpgtsp 33,0,11
|
||||
xxsel 0,11,0,33
|
||||
xxsel 33,41,42,33
|
||||
xvcmpgtsp 32,0,12
|
||||
vadduwm 1,1,26
|
||||
xxsel 0,12,0,32
|
||||
xxsel 32,43,33,32
|
||||
xvcmpgtsp 33,0,48
|
||||
vadduwm 0,14,0
|
||||
vadduwm 14,14,27
|
||||
xxsel 47,47,32,33
|
||||
xxsel 48,48,0,33
|
||||
bgt 7,.L5
|
||||
xxsldwi 11,48,48,3
|
||||
xxsldwi 12,48,48,2
|
||||
li 10,0
|
||||
li 3,12
|
||||
xxsldwi 0,48,48,1
|
||||
xscvspdp 48,48
|
||||
vextuwrx 6,10,15
|
||||
li 10,4
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
xscvspdp 0,0
|
||||
vextuwrx 5,10,15
|
||||
li 10,8
|
||||
vextuwrx 7,10,15
|
||||
vextuwrx 10,3,15
|
||||
rldicl 12,5,0,32
|
||||
rldicl 3,6,0,32
|
||||
rldicl 11,7,0,32
|
||||
rldicl 0,10,0,32
|
||||
fcmpu 7,11,12
|
||||
fmr 10,0
|
||||
beq 7,.L55
|
||||
bnl 7,.L8
|
||||
mr 3,12
|
||||
fmr 11,12
|
||||
.L8:
|
||||
xscmpudp 7,0,48
|
||||
bne 7,.L11
|
||||
cmplw 7,7,10
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,11,10
|
||||
beq 7,.L56
|
||||
bnl 7,.L17
|
||||
mr 3,11
|
||||
fmr 11,10
|
||||
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
subf 6,8,9
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10
|
||||
bgt 7,.L37
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L20
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
.L19:
|
||||
lxv 56,-128(1)
|
||||
lxv 57,-112(1)
|
||||
addi 3,3,1
|
||||
lxv 58,-96(1)
|
||||
lxv 59,-80(1)
|
||||
lxv 60,-64(1)
|
||||
lxv 61,-48(1)
|
||||
lxv 62,-32(1)
|
||||
lxv 63,-16(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L55:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
bnl 7,.L13
|
||||
mr 11,0
|
||||
xscpsgndp 10,48,48
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L56:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size icamax_k,.-icamax_k
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.byte 16
|
||||
.byte 17
|
||||
.byte 18
|
||||
.byte 19
|
||||
.byte 24
|
||||
.byte 25
|
||||
.byte 26
|
||||
.byte 27
|
||||
.LC3:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 20
|
||||
.byte 21
|
||||
.byte 22
|
||||
.byte 23
|
||||
.byte 28
|
||||
.byte 29
|
||||
.byte 30
|
||||
.byte 31
|
||||
.LC4:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC5:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC6:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC7:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
457
kernel/power/icamin_power8.S
Normal file
457
kernel/power/icamin_power8.S
Normal file
@@ -0,0 +1,457 @@
|
||||
/* .file "icamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamin_k
|
||||
.type icamin_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry icamin_k,.-icamin_k
|
||||
#endif
|
||||
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
lfs 11,0(4)
|
||||
lfs 0,4(4)
|
||||
cmpdi 7,5,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0
|
||||
beq 7,.L54
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L29
|
||||
addi 9,9,-1
|
||||
sldi 5,5,3
|
||||
mtctr 9
|
||||
add 4,4,5
|
||||
li 3,0
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bnl 7,.L23
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
.L52:
|
||||
addi 3,3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L54:
|
||||
rldicr. 8,9,0,58
|
||||
bne 0,.L55
|
||||
addi 7,8,1
|
||||
li 10,0
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
add 4,4,10
|
||||
subf 10,8,9
|
||||
mtctr 10
|
||||
li 3,0
|
||||
bgt 7,.L43
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
.L44:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L46
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L52
|
||||
.p2align 4,,15
|
||||
.L55:
|
||||
li 0,-128
|
||||
std 31,-8(1)
|
||||
addis 5,2,.LC2@toc@ha
|
||||
xscvdpspn 11,11
|
||||
vspltisw 19,0
|
||||
addis 6,2,.LC3@toc@ha
|
||||
addi 5,5,.LC2@toc@l
|
||||
stvx 25,1,0
|
||||
li 0,-112
|
||||
addi 6,6,.LC3@toc@l
|
||||
xxlor 50,51,51
|
||||
addis 7,2,.LC4@toc@ha
|
||||
lxvd2x 44,0,5
|
||||
addis 10,2,.LC5@toc@ha
|
||||
stvx 26,1,0
|
||||
li 0,-96
|
||||
addi 7,7,.LC4@toc@l
|
||||
lxvd2x 45,0,6
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
stvx 27,1,0
|
||||
li 0,-80
|
||||
addi 10,10,.LC5@toc@l
|
||||
xxspltw 5,11,0
|
||||
addi 6,6,.LC7@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
stvx 28,1,0
|
||||
li 0,-64
|
||||
lxvd2x 47,0,10
|
||||
xxpermdi 44,44,44,2
|
||||
mr 10,4
|
||||
lxvd2x 49,0,6
|
||||
lxvd2x 48,0,5
|
||||
xxpermdi 45,45,45,2
|
||||
li 6,0
|
||||
stvx 29,1,0
|
||||
li 0,-48
|
||||
xxlnand 44,44,44
|
||||
xxlnand 45,45,45
|
||||
stvx 30,1,0
|
||||
lxvd2x 62,0,7
|
||||
addis 7,2,.LC8@toc@ha
|
||||
li 0,-32
|
||||
addi 7,7,.LC8@toc@l
|
||||
xxpermdi 47,47,47,2
|
||||
stvx 31,1,0
|
||||
vspltisw 31,8
|
||||
xxpermdi 48,48,48,2
|
||||
lxvd2x 46,0,7
|
||||
vadduwm 31,31,31
|
||||
xxpermdi 49,49,49,2
|
||||
xxpermdi 62,62,62,2
|
||||
.p2align 4,,15
|
||||
.L5:
|
||||
addi 3,10,16
|
||||
addi 5,10,32
|
||||
lxvd2x 34,0,10
|
||||
addi 7,10,64
|
||||
addi 31,10,48
|
||||
addi 12,10,80
|
||||
addi 11,10,96
|
||||
lxvd2x 36,0,3
|
||||
lxvd2x 37,0,5
|
||||
addi 3,10,112
|
||||
addi 5,10,128
|
||||
lxvd2x 38,0,7
|
||||
lxvd2x 6,0,31
|
||||
addi 7,10,160
|
||||
addi 31,10,144
|
||||
lxvd2x 33,0,12
|
||||
lxvd2x 39,0,11
|
||||
addi 12,10,176
|
||||
addi 11,10,192
|
||||
lxvd2x 7,0,3
|
||||
lxvd2x 40,0,5
|
||||
xxpermdi 34,34,34,2
|
||||
addi 3,10,208
|
||||
addi 5,10,224
|
||||
lxvd2x 41,0,7
|
||||
lxvd2x 8,0,31
|
||||
addi 7,10,240
|
||||
lxvd2x 9,0,12
|
||||
lxvd2x 42,0,11
|
||||
xxpermdi 37,37,37,2
|
||||
xxpermdi 36,36,36,2
|
||||
addi 6,6,32
|
||||
lxvd2x 32,0,3
|
||||
lxvd2x 43,0,5
|
||||
xxpermdi 6,6,6,2
|
||||
xxpermdi 38,38,38,2
|
||||
cmpd 7,8,6
|
||||
addi 10,10,256
|
||||
lxvd2x 10,0,7
|
||||
xxpermdi 39,39,39,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 40,40,40,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 41,41,41,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
xxpermdi 42,42,42,2
|
||||
xxpermdi 43,43,43,2
|
||||
xxpermdi 32,32,32,2
|
||||
xxpermdi 10,10,10,2
|
||||
xvabssp 58,37
|
||||
xvabssp 59,39
|
||||
xvabssp 35,40
|
||||
xvabssp 60,41
|
||||
xvabssp 34,34
|
||||
xvabssp 33,33
|
||||
xvabssp 32,32
|
||||
xvabssp 61,43
|
||||
xvabssp 36,36
|
||||
xvabssp 37,6
|
||||
xvabssp 38,38
|
||||
xvabssp 39,7
|
||||
xvabssp 40,8
|
||||
xvabssp 41,9
|
||||
xvabssp 42,42
|
||||
xvabssp 43,10
|
||||
vperm 25,4,2,12
|
||||
vperm 4,4,2,13
|
||||
vperm 2,5,26,12
|
||||
vperm 5,5,26,13
|
||||
vperm 26,1,6,12
|
||||
vperm 6,1,6,13
|
||||
vperm 1,7,27,12
|
||||
vperm 7,7,27,13
|
||||
vperm 27,8,3,12
|
||||
vperm 8,8,3,13
|
||||
vperm 3,9,28,12
|
||||
vperm 9,9,28,13
|
||||
vperm 28,0,10,12
|
||||
vperm 10,0,10,13
|
||||
vperm 0,11,29,12
|
||||
vperm 11,11,29,13
|
||||
xvaddsp 12,33,39
|
||||
xvaddsp 38,58,38
|
||||
xvaddsp 0,32,43
|
||||
xvaddsp 42,60,42
|
||||
xvaddsp 36,57,36
|
||||
xvaddsp 37,34,37
|
||||
xvaddsp 40,59,40
|
||||
xvaddsp 41,35,41
|
||||
xvcmpgtsp 32,38,12
|
||||
xvcmpgtsp 33,42,0
|
||||
xvcmpgtsp 43,36,37
|
||||
xvcmpgtsp 39,40,41
|
||||
xxsel 12,38,12,32
|
||||
xxsel 38,48,49,32
|
||||
xxsel 0,42,0,33
|
||||
xxsel 42,48,49,33
|
||||
xxsel 37,36,37,43
|
||||
xxsel 43,62,47,43
|
||||
xxsel 41,40,41,39
|
||||
xxsel 39,62,47,39
|
||||
xvcmpgtsp 32,37,12
|
||||
xvcmpgtsp 33,41,0
|
||||
xxsel 12,37,12,32
|
||||
xxsel 43,43,38,32
|
||||
xxsel 0,41,0,33
|
||||
xxsel 33,39,42,33
|
||||
xvcmpgtsp 32,12,0
|
||||
vadduwm 1,1,31
|
||||
xxsel 0,12,0,32
|
||||
xxsel 32,43,33,32
|
||||
xvcmpgtsp 33,5,0
|
||||
vadduwm 0,0,18
|
||||
vadduwm 18,18,14
|
||||
xxsel 51,51,32,33
|
||||
xxsel 5,5,0,33
|
||||
bgt 7,.L5
|
||||
xxsldwi 11,5,5,3
|
||||
xxsldwi 12,5,5,2
|
||||
vspltw 0,19,3
|
||||
xxsldwi 0,5,5,1
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
mfvsrwz 6,32
|
||||
vspltw 0,19,2
|
||||
xscvspdp 0,0
|
||||
mfvsrwz 7,51
|
||||
mfvsrwz 5,32
|
||||
vspltw 0,19,0
|
||||
xscvspdp 5,5
|
||||
mfvsrwz 10,32
|
||||
fcmpu 7,11,12
|
||||
rldicl 3,6,0,32
|
||||
fmr 10,0
|
||||
rldicl 11,7,0,32
|
||||
rldicl 31,5,0,32
|
||||
rldicl 0,10,0,32
|
||||
beq 7,.L56
|
||||
bng 7,.L8
|
||||
fmr 11,12
|
||||
mr 3,31
|
||||
.L8:
|
||||
fcmpu 7,0,5
|
||||
bne 7,.L11
|
||||
cmplw 7,7,10
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,11,10
|
||||
beq 7,.L57
|
||||
bgt 7,.L58
|
||||
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
add 4,4,10
|
||||
subf 10,8,9
|
||||
mtctr 10
|
||||
bgt 7,.L37
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L20
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
.L19:
|
||||
li 0,-128
|
||||
ld 31,-8(1)
|
||||
addi 3,3,1
|
||||
lvx 25,1,0
|
||||
li 0,-112
|
||||
lvx 26,1,0
|
||||
li 0,-96
|
||||
lvx 27,1,0
|
||||
li 0,-80
|
||||
lvx 28,1,0
|
||||
li 0,-64
|
||||
lvx 29,1,0
|
||||
li 0,-48
|
||||
lvx 30,1,0
|
||||
li 0,-32
|
||||
lvx 31,1,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L56:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
bng 7,.L13
|
||||
fmr 10,5
|
||||
mr 11,0
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L57:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
.p2align 4,,15
|
||||
.L58:
|
||||
fmr 11,10
|
||||
mr 3,11
|
||||
b .L17
|
||||
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size icamin_k,.-icamin_k
|
||||
#endif
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.byte 16
|
||||
.byte 17
|
||||
.byte 18
|
||||
.byte 19
|
||||
.byte 24
|
||||
.byte 25
|
||||
.byte 26
|
||||
.byte 27
|
||||
.LC3:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 20
|
||||
.byte 21
|
||||
.byte 22
|
||||
.byte 23
|
||||
.byte 28
|
||||
.byte 29
|
||||
.byte 30
|
||||
.byte 31
|
||||
.LC4:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC5:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC6:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC7:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.LC8:
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
385
kernel/power/icamin_power9.S
Normal file
385
kernel/power/icamin_power9.S
Normal file
@@ -0,0 +1,385 @@
|
||||
.file "icamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamin_k
|
||||
.type icamin_k, @function
|
||||
icamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry icamin_k,.-icamin_k
|
||||
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
lfs 11,0(4)
|
||||
lfs 0,4(4)
|
||||
cmpdi 7,5,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0
|
||||
beq 7,.L53
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L29
|
||||
addi 9,9,-1
|
||||
sldi 5,5,3
|
||||
li 3,0
|
||||
mtctr 9
|
||||
add 4,4,5
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bnl 7,.L23
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
.L51:
|
||||
addi 3,3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L53:
|
||||
rldicr. 8,9,0,58
|
||||
bne 0,.L54
|
||||
addi 7,8,1
|
||||
li 10,0
|
||||
subf 6,8,9
|
||||
li 3,0
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10
|
||||
bgt 7,.L43
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
.L44:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L46
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L51
|
||||
.p2align 4,,15
|
||||
.L54:
|
||||
xscvdpspn 9,11
|
||||
addis 11,2,.LC2@toc@ha
|
||||
addis 3,2,.LC3@toc@ha
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
addis 7,2,.LC4@toc@ha
|
||||
addis 10,2,.LC5@toc@ha
|
||||
xxspltib 48,0
|
||||
addi 11,11,.LC2@toc@l
|
||||
addi 3,3,.LC3@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
stxv 59,-80(1)
|
||||
addi 6,6,.LC7@toc@l
|
||||
stxv 60,-64(1)
|
||||
stxv 63,-16(1)
|
||||
addi 7,7,.LC4@toc@l
|
||||
xxspltib 59,16
|
||||
lxv 44,0(11)
|
||||
xxspltib 60,32
|
||||
lxv 45,0(3)
|
||||
lxv 63,0(5)
|
||||
xxlor 47,48,48
|
||||
lxv 46,0(6)
|
||||
addi 10,10,.LC5@toc@l
|
||||
stxv 61,-48(1)
|
||||
stxv 62,-32(1)
|
||||
xxspltw 9,9,0
|
||||
lxv 61,0(7)
|
||||
lxv 62,0(10)
|
||||
li 7,0
|
||||
mr 10,4
|
||||
vextsb2w 27,27
|
||||
vextsb2w 28,28
|
||||
stxv 57,-112(1)
|
||||
stxv 58,-96(1)
|
||||
.p2align 4,,15
|
||||
.L5:
|
||||
lxv 0,0(10)
|
||||
addi 7,7,32
|
||||
addi 10,10,256
|
||||
cmpd 7,8,7
|
||||
xvabssp 34,0
|
||||
lxv 0,-240(10)
|
||||
xvabssp 42,0
|
||||
lxv 0,-224(10)
|
||||
xvabssp 49,0
|
||||
lxv 0,-208(10)
|
||||
vpermr 26,10,2,12
|
||||
vpermr 2,10,2,13
|
||||
xvabssp 35,0
|
||||
lxv 0,-192(10)
|
||||
xvaddsp 34,58,34
|
||||
xvabssp 36,0
|
||||
lxv 0,-176(10)
|
||||
vpermr 10,3,17,12
|
||||
vpermr 3,3,17,13
|
||||
xvabssp 33,0
|
||||
lxv 0,-160(10)
|
||||
xvaddsp 10,42,35
|
||||
xvabssp 50,0
|
||||
lxv 0,-144(10)
|
||||
vpermr 17,1,4,12
|
||||
vpermr 4,1,4,13
|
||||
xvabssp 37,0
|
||||
lxv 0,-128(10)
|
||||
xvaddsp 36,49,36
|
||||
xvabssp 38,0
|
||||
lxv 0,-112(10)
|
||||
vpermr 1,5,18,12
|
||||
vpermr 5,5,18,13
|
||||
xvabssp 43,0
|
||||
lxv 0,-96(10)
|
||||
xvaddsp 12,33,37
|
||||
xvabssp 51,0
|
||||
lxv 0,-80(10)
|
||||
vpermr 18,11,6,12
|
||||
vpermr 6,11,6,13
|
||||
xvabssp 39,0
|
||||
lxv 0,-64(10)
|
||||
xvaddsp 38,50,38
|
||||
xvabssp 40,0
|
||||
lxv 0,-48(10)
|
||||
vpermr 11,7,19,12
|
||||
vpermr 7,7,19,13
|
||||
xvabssp 32,0
|
||||
lxv 0,-32(10)
|
||||
xvaddsp 11,43,39
|
||||
xvcmpgtsp 39,34,10
|
||||
xvcmpgtsp 43,36,12
|
||||
xvabssp 57,0
|
||||
lxv 0,-16(10)
|
||||
vpermr 19,0,8,12
|
||||
vpermr 8,0,8,13
|
||||
xxsel 10,34,10,39
|
||||
xxsel 12,36,12,43
|
||||
xxsel 39,61,62,39
|
||||
xxsel 43,63,46,43
|
||||
xvabssp 41,0
|
||||
xvaddsp 40,51,40
|
||||
vpermr 0,9,25,12
|
||||
vpermr 9,9,25,13
|
||||
xvaddsp 0,32,41
|
||||
xvcmpgtsp 41,38,11
|
||||
xvcmpgtsp 32,10,12
|
||||
xvcmpgtsp 42,40,0
|
||||
xxsel 11,38,11,41
|
||||
xxsel 12,10,12,32
|
||||
xxsel 43,39,43,32
|
||||
xxsel 41,61,62,41
|
||||
xxsel 0,40,0,42
|
||||
xxsel 42,63,46,42
|
||||
xvcmpgtsp 33,11,0
|
||||
xxsel 0,11,0,33
|
||||
xxsel 33,41,42,33
|
||||
xvcmpgtsp 32,12,0
|
||||
vadduwm 1,1,27
|
||||
xxsel 0,12,0,32
|
||||
xxsel 32,43,33,32
|
||||
xvcmpgtsp 33,9,0
|
||||
vadduwm 0,0,15
|
||||
vadduwm 15,15,28
|
||||
xxsel 48,48,32,33
|
||||
xxsel 9,9,0,33
|
||||
bgt 7,.L5
|
||||
xxsldwi 11,9,9,3
|
||||
xxsldwi 12,9,9,2
|
||||
li 10,0
|
||||
li 3,12
|
||||
xxsldwi 0,9,9,1
|
||||
xscvspdp 9,9
|
||||
vextuwrx 6,10,16
|
||||
li 10,4
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
xscvspdp 0,0
|
||||
vextuwrx 5,10,16
|
||||
li 10,8
|
||||
vextuwrx 7,10,16
|
||||
vextuwrx 10,3,16
|
||||
rldicl 12,5,0,32
|
||||
rldicl 3,6,0,32
|
||||
rldicl 11,7,0,32
|
||||
rldicl 0,10,0,32
|
||||
fcmpu 7,11,12
|
||||
fmr 10,0
|
||||
beq 7,.L55
|
||||
bng 7,.L8
|
||||
mr 3,12
|
||||
fmr 11,12
|
||||
.L8:
|
||||
fcmpu 7,0,9
|
||||
bne 7,.L11
|
||||
cmplw 7,7,10
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,11,10
|
||||
beq 7,.L56
|
||||
bng 7,.L17
|
||||
mr 3,11
|
||||
fmr 11,10
|
||||
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
subf 6,8,9
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10
|
||||
bgt 7,.L37
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L20
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
.L19:
|
||||
lxv 57,-112(1)
|
||||
lxv 58,-96(1)
|
||||
addi 3,3,1
|
||||
lxv 59,-80(1)
|
||||
lxv 60,-64(1)
|
||||
lxv 61,-48(1)
|
||||
lxv 62,-32(1)
|
||||
lxv 63,-16(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L55:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
bng 7,.L13
|
||||
mr 11,0
|
||||
fmr 10,9
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L56:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size icamin_k,.-icamin_k
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.byte 16
|
||||
.byte 17
|
||||
.byte 18
|
||||
.byte 19
|
||||
.byte 24
|
||||
.byte 25
|
||||
.byte 26
|
||||
.byte 27
|
||||
.LC3:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 20
|
||||
.byte 21
|
||||
.byte 22
|
||||
.byte 23
|
||||
.byte 28
|
||||
.byte 29
|
||||
.byte 30
|
||||
.byte 31
|
||||
.LC4:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC5:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC6:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC7:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
@@ -325,13 +325,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
if (n1 > 0) {
|
||||
|
||||
max = diamax_kernel_32(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) > maxf) {
|
||||
max = i;
|
||||
|
||||
@@ -326,13 +326,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
minf = ABS(x[0]); //index's not incremented
|
||||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
if (n1 > 0) {
|
||||
|
||||
min = diamin_kernel_32(n1, x, &minf);
|
||||
i = n1;
|
||||
}
|
||||
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) < minf) {
|
||||
min = i;
|
||||
|
||||
437
kernel/power/isamax_power8.S
Normal file
437
kernel/power/isamax_power8.S
Normal file
@@ -0,0 +1,437 @@
|
||||
/* .file "isamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl isamax_k
|
||||
.type isamax_k, @function
|
||||
*/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry isamax_k,.-isamax_k
|
||||
#endif
|
||||
mr. 11,3
|
||||
ble 0,.L36
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
cmpdi 7,5,1
|
||||
beq 7,.L69
|
||||
rldicr. 7,11,0,61
|
||||
beq 0,.L40
|
||||
sldi 3,5,1
|
||||
xxlxor 0,0,0
|
||||
sldi 6,5,2
|
||||
add 3,3,5
|
||||
sldi 0,5,4
|
||||
sldi 3,3,2
|
||||
sldi 5,5,3
|
||||
mr 9,4
|
||||
li 8,0
|
||||
li 10,0
|
||||
.p2align 4,,15
|
||||
.L31:
|
||||
lfs 12,0(9)
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L23
|
||||
fmr 0,12
|
||||
mr 8,10
|
||||
.L23:
|
||||
lfsx 12,9,6
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L25
|
||||
fmr 0,12
|
||||
addi 8,10,1
|
||||
.L25:
|
||||
lfsx 12,9,5
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L27
|
||||
fmr 0,12
|
||||
addi 8,10,2
|
||||
.L27:
|
||||
lfsx 12,9,3
|
||||
add 9,9,0
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L29
|
||||
fmr 0,12
|
||||
addi 8,10,3
|
||||
.L29:
|
||||
addi 10,10,4
|
||||
cmpd 7,7,10
|
||||
bgt 7,.L31
|
||||
addi 7,7,-1
|
||||
srdi 7,7,2
|
||||
addi 7,7,1
|
||||
sldi 9,7,2
|
||||
mulld 7,6,7
|
||||
cmpd 7,11,9
|
||||
ble 7,.L67
|
||||
.L22:
|
||||
addi 10,9,1
|
||||
sldi 7,7,2
|
||||
cmpd 7,10,11
|
||||
subf 10,9,11
|
||||
mtctr 10
|
||||
add 4,4,7
|
||||
bgt 7,.L54
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L54
|
||||
.p2align 4,,15
|
||||
.L35:
|
||||
lfs 12,0(4)
|
||||
add 4,4,6
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L33
|
||||
fmr 0,12
|
||||
mr 8,9
|
||||
.L33:
|
||||
addi 9,9,1
|
||||
bdnz .L35
|
||||
.L67:
|
||||
addi 3,8,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L36:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L69:
|
||||
rldicr. 10,11,0,57
|
||||
bne 0,.L70
|
||||
addi 7,10,1
|
||||
sldi 9,10,2
|
||||
xxlxor 12,12,12
|
||||
cmpd 7,7,11
|
||||
add 4,4,9
|
||||
subf 9,10,11
|
||||
li 8,0
|
||||
mtctr 9
|
||||
bgt 7,.L60
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L60
|
||||
.p2align 4,,15
|
||||
.L61:
|
||||
lfs 0,0(4)
|
||||
addi 4,4,4
|
||||
fabs 0,0
|
||||
fcmpu 7,0,12
|
||||
bng 7,.L63
|
||||
fmr 12,0
|
||||
mr 8,10
|
||||
.L63:
|
||||
addi 10,10,1
|
||||
bdnz .L61
|
||||
b .L67
|
||||
.p2align 4,,15
|
||||
.L70:
|
||||
li 0,-64
|
||||
std 31,-8(1)
|
||||
addis 3,2,.LC2@toc@ha
|
||||
vspltisw 18,0
|
||||
vspltisw 12,0
|
||||
addis 5,2,.LC3@toc@ha
|
||||
addis 6,2,.LC6@toc@ha
|
||||
stvx 29,1,0
|
||||
li 0,-48
|
||||
addis 8,2,.LC7@toc@ha
|
||||
xxlor 35,50,50
|
||||
addi 3,3,.LC2@toc@l
|
||||
addi 5,5,.LC3@toc@l
|
||||
stvx 30,1,0
|
||||
addi 6,6,.LC6@toc@l
|
||||
li 0,-32
|
||||
addi 8,8,.LC7@toc@l
|
||||
lxvd2x 51,0,3
|
||||
lxvd2x 34,0,5
|
||||
addis 7,2,.LC4@toc@ha
|
||||
stvx 31,1,0
|
||||
lxvd2x 47,0,6
|
||||
addis 9,2,.LC5@toc@ha
|
||||
addi 7,7,.LC4@toc@l
|
||||
lxvd2x 48,0,8
|
||||
addi 9,9,.LC5@toc@l
|
||||
vspltisw 17,8
|
||||
vadduwm 17,17,17
|
||||
lxvd2x 36,0,7
|
||||
li 7,0
|
||||
lxvd2x 37,0,9
|
||||
mr 9,4
|
||||
.p2align 4,,15
|
||||
.L5:
|
||||
addi 5,9,16
|
||||
addi 6,9,32
|
||||
lxvd2x 41,0,9
|
||||
vadduwm 31,3,15
|
||||
addi 8,9,64
|
||||
addi 31,9,48
|
||||
addi 12,9,80
|
||||
addi 3,9,96
|
||||
lxvd2x 5,0,5
|
||||
lxvd2x 43,0,6
|
||||
addi 5,9,112
|
||||
addi 6,9,128
|
||||
lxvd2x 1,0,8
|
||||
lxvd2x 9,0,31
|
||||
addi 8,9,160
|
||||
addi 31,9,144
|
||||
lxvd2x 6,0,12
|
||||
lxvd2x 13,0,3
|
||||
addi 12,9,176
|
||||
addi 3,9,192
|
||||
lxvd2x 11,0,5
|
||||
lxvd2x 2,0,6
|
||||
xvabssp 41,41
|
||||
addi 5,9,208
|
||||
addi 6,9,224
|
||||
lxvd2x 3,0,8
|
||||
lxvd2x 7,0,31
|
||||
addi 8,9,240
|
||||
lxvd2x 10,0,12
|
||||
lxvd2x 4,0,3
|
||||
xvabssp 43,43
|
||||
xvabssp 5,5
|
||||
addi 7,7,64
|
||||
lxvd2x 8,0,5
|
||||
lxvd2x 0,0,6
|
||||
xvabssp 9,9
|
||||
xvabssp 1,1
|
||||
cmpd 7,10,7
|
||||
addi 9,9,256
|
||||
lxvd2x 12,0,8
|
||||
xvabssp 6,6
|
||||
xvabssp 13,13
|
||||
xvabssp 11,11
|
||||
xvabssp 2,2
|
||||
xvabssp 7,7
|
||||
xvabssp 3,3
|
||||
xvabssp 10,10
|
||||
xvabssp 4,4
|
||||
xvabssp 8,8
|
||||
xvabssp 0,0
|
||||
xvabssp 12,12
|
||||
xvcmpgtsp 32,5,41
|
||||
xvcmpgtsp 61,9,43
|
||||
xvcmpgtsp 45,6,1
|
||||
xvcmpgtsp 62,11,13
|
||||
xvcmpgtsp 38,7,2
|
||||
xvcmpgtsp 46,10,3
|
||||
xvcmpgtsp 40,8,4
|
||||
xvcmpgtsp 39,12,0
|
||||
xxsel 5,41,5,32
|
||||
xxsel 32,51,34,32
|
||||
xxsel 9,43,9,61
|
||||
xxsel 6,1,6,45
|
||||
xxsel 11,13,11,62
|
||||
xxsel 43,51,34,45
|
||||
xxsel 7,2,7,38
|
||||
xvcmpgtsp 41,9,5
|
||||
xxsel 10,3,10,46
|
||||
xvcmpgtsp 45,11,6
|
||||
xxsel 8,4,8,40
|
||||
xxsel 62,36,37,62
|
||||
xxsel 0,0,12,39
|
||||
xvcmpgtsp 42,10,7
|
||||
xxsel 61,36,37,61
|
||||
xxsel 40,51,34,40
|
||||
xvcmpgtsp 33,0,8
|
||||
xxsel 39,36,37,39
|
||||
xxsel 38,51,34,38
|
||||
xxsel 46,36,37,46
|
||||
xxsel 9,5,9,41
|
||||
xxsel 41,32,61,41
|
||||
xxsel 12,6,11,45
|
||||
xxsel 45,43,62,45
|
||||
xxsel 11,7,10,42
|
||||
xvcmpgtsp 32,12,9
|
||||
vadduwm 13,13,17
|
||||
xxsel 42,38,46,42
|
||||
xxsel 0,8,0,33
|
||||
xxsel 33,40,39,33
|
||||
xvcmpgtsp 43,0,11
|
||||
vadduwm 1,1,17
|
||||
xxsel 12,9,12,32
|
||||
xxsel 32,41,45,32
|
||||
vadduwm 0,3,0
|
||||
vadduwm 3,3,16
|
||||
xxsel 0,11,0,43
|
||||
xxsel 33,42,33,43
|
||||
xvcmpgtsp 45,0,12
|
||||
vadduwm 1,31,1
|
||||
xxsel 0,12,0,45
|
||||
xxsel 32,32,33,45
|
||||
xvcmpgtsp 33,0,44
|
||||
xxsel 50,50,32,33
|
||||
xxsel 44,44,0,33
|
||||
bgt 7,.L5
|
||||
xxsldwi 12,44,44,1
|
||||
xscvspdp 10,44
|
||||
vspltw 0,18,0
|
||||
xxsldwi 0,44,44,3
|
||||
xscvspdp 12,12
|
||||
mfvsrwz 3,50
|
||||
mfvsrwz 6,32
|
||||
vspltw 0,18,3
|
||||
xscvspdp 0,0
|
||||
xxsldwi 44,44,44,2
|
||||
mfvsrwz 7,32
|
||||
vspltw 0,18,2
|
||||
xscvspdp 44,44
|
||||
mfvsrwz 9,32
|
||||
fcmpu 7,12,10
|
||||
rldicl 8,3,0,32
|
||||
rldicl 31,6,0,32
|
||||
fmr 11,0
|
||||
rldicl 0,7,0,32
|
||||
rldicl 5,9,0,32
|
||||
beq 7,.L71
|
||||
bnl 7,.L8
|
||||
fmr 12,10
|
||||
mr 8,31
|
||||
.L8:
|
||||
xscmpudp 7,0,44
|
||||
bne 7,.L11
|
||||
cmplw 7,7,9
|
||||
ble 7,.L12
|
||||
mr 7,9
|
||||
.L12:
|
||||
rldicl 5,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,12,11
|
||||
beq 7,.L72
|
||||
bnl 7,.L17
|
||||
fmr 12,11
|
||||
mr 8,5
|
||||
.L17:
|
||||
cmpd 7,11,10
|
||||
ble 7,.L16
|
||||
addi 7,10,1
|
||||
sldi 9,10,2
|
||||
cmpd 7,7,11
|
||||
add 4,4,9
|
||||
subf 9,10,11
|
||||
mtctr 9
|
||||
bgt 7,.L53
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L53
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,0(4)
|
||||
addi 4,4,4
|
||||
fabs 0,0
|
||||
fcmpu 7,0,12
|
||||
bng 7,.L19
|
||||
fmr 12,0
|
||||
mr 8,10
|
||||
.L19:
|
||||
addi 10,10,1
|
||||
bdnz .L21
|
||||
.L16:
|
||||
li 0,-64
|
||||
ld 31,-8(1)
|
||||
addi 3,8,1
|
||||
lvx 29,1,0
|
||||
li 0,-48
|
||||
lvx 30,1,0
|
||||
li 0,-32
|
||||
lvx 31,1,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L71:
|
||||
cmplw 7,3,6
|
||||
ble 7,.L7
|
||||
mr 3,6
|
||||
.L7:
|
||||
rldicl 8,3,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
.L40:
|
||||
xxlxor 0,0,0
|
||||
sldi 6,5,2
|
||||
li 8,0
|
||||
li 9,0
|
||||
b .L22
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
blt 7,.L39
|
||||
mr 5,0
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L72:
|
||||
cmpd 7,8,5
|
||||
ble 7,.L17
|
||||
mr 8,5
|
||||
b .L17
|
||||
.p2align 4,,15
|
||||
.L39:
|
||||
xscpsgndp 11,44,44
|
||||
b .L13
|
||||
.L53:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.L54:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L35
|
||||
.L60:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L61
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size isamax_k,.-isamax_k
|
||||
#endif
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC3:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC4:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC5:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.LC6:
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.LC7:
|
||||
.long 64
|
||||
.long 64
|
||||
.long 64
|
||||
.long 64
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
397
kernel/power/isamax_power9.S
Normal file
397
kernel/power/isamax_power9.S
Normal file
@@ -0,0 +1,397 @@
|
||||
.file "isamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl isamax_k
|
||||
.type isamax_k, @function
|
||||
isamax_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry isamax_k,.-isamax_k
|
||||
mr. 11,3
|
||||
ble 0,.L36
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
cmpdi 7,5,1
|
||||
beq 7,.L69
|
||||
rldicr. 7,11,0,61
|
||||
beq 0,.L40
|
||||
sldi 10,5,1
|
||||
sldi 6,5,2
|
||||
sldi 0,5,4
|
||||
sldi 3,5,3
|
||||
mr 9,4
|
||||
xxlxor 0,0,0
|
||||
li 8,0
|
||||
add 5,10,5
|
||||
li 10,0
|
||||
sldi 5,5,2
|
||||
.p2align 4,,15
|
||||
.L31:
|
||||
lfs 12,0(9)
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L23
|
||||
fmr 0,12
|
||||
mr 8,10
|
||||
.L23:
|
||||
lfsx 12,9,6
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L25
|
||||
fmr 0,12
|
||||
addi 8,10,1
|
||||
.L25:
|
||||
lfsx 12,9,3
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L27
|
||||
fmr 0,12
|
||||
addi 8,10,2
|
||||
.L27:
|
||||
lfsx 12,9,5
|
||||
add 9,9,0
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L29
|
||||
fmr 0,12
|
||||
addi 8,10,3
|
||||
.L29:
|
||||
addi 10,10,4
|
||||
cmpd 7,7,10
|
||||
bgt 7,.L31
|
||||
addi 7,7,-1
|
||||
srdi 7,7,2
|
||||
addi 7,7,1
|
||||
sldi 9,7,2
|
||||
mulld 7,6,7
|
||||
cmpd 7,11,9
|
||||
ble 7,.L67
|
||||
.L22:
|
||||
addi 10,9,1
|
||||
sldi 7,7,2
|
||||
subf 5,9,11
|
||||
cmpd 7,10,11
|
||||
mtctr 5
|
||||
add 4,4,7
|
||||
bgt 7,.L54
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L54
|
||||
.p2align 4,,15
|
||||
.L35:
|
||||
lfs 12,0(4)
|
||||
add 4,4,6
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bng 7,.L33
|
||||
fmr 0,12
|
||||
mr 8,9
|
||||
.L33:
|
||||
addi 9,9,1
|
||||
bdnz .L35
|
||||
.L67:
|
||||
addi 3,8,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L36:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L69:
|
||||
rldicr. 10,11,0,57
|
||||
bne 0,.L70
|
||||
addi 7,10,1
|
||||
sldi 9,10,2
|
||||
subf 6,10,11
|
||||
li 8,0
|
||||
xxlxor 12,12,12
|
||||
cmpd 7,7,11
|
||||
mtctr 6
|
||||
add 4,4,9
|
||||
bgt 7,.L60
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L60
|
||||
.p2align 4,,15
|
||||
.L61:
|
||||
lfs 0,0(4)
|
||||
addi 4,4,4
|
||||
fabs 0,0
|
||||
fcmpu 7,0,12
|
||||
bng 7,.L63
|
||||
fmr 12,0
|
||||
mr 8,10
|
||||
.L63:
|
||||
addi 10,10,1
|
||||
bdnz .L61
|
||||
b .L67
|
||||
.p2align 4,,15
|
||||
.L70:
|
||||
addis 6,2,.LC2@toc@ha
|
||||
addis 7,2,.LC3@toc@ha
|
||||
addis 8,2,.LC4@toc@ha
|
||||
addis 9,2,.LC5@toc@ha
|
||||
xxspltib 46,0
|
||||
stxv 61,-48(1)
|
||||
stxv 62,-32(1)
|
||||
addi 6,6,.LC2@toc@l
|
||||
addi 7,7,.LC3@toc@l
|
||||
stxv 63,-16(1)
|
||||
xxspltib 61,32
|
||||
xxspltib 63,16
|
||||
xxspltib 62,64
|
||||
addi 8,8,.LC4@toc@l
|
||||
addi 9,9,.LC5@toc@l
|
||||
lxv 47,0(6)
|
||||
xxspltib 34,0
|
||||
lxv 48,0(7)
|
||||
xxlor 51,46,46
|
||||
lxv 49,0(8)
|
||||
lxv 50,0(9)
|
||||
li 8,0
|
||||
mr 9,4
|
||||
vextsb2w 29,29
|
||||
vextsb2w 31,31
|
||||
vextsb2w 30,30
|
||||
stxv 59,-80(1)
|
||||
stxv 60,-64(1)
|
||||
.p2align 4,,15
|
||||
.L5:
|
||||
lxv 0,0(9)
|
||||
vadduwm 27,19,29
|
||||
lxv 12,240(9)
|
||||
addi 8,8,64
|
||||
addi 9,9,256
|
||||
cmpd 7,10,8
|
||||
xvabssp 44,0
|
||||
lxv 0,-240(9)
|
||||
xvabssp 12,12
|
||||
xvabssp 5,0
|
||||
lxv 0,-224(9)
|
||||
xvabssp 32,0
|
||||
lxv 0,-208(9)
|
||||
xvcmpgtsp 35,5,44
|
||||
xvabssp 9,0
|
||||
lxv 0,-192(9)
|
||||
xxsel 5,44,5,35
|
||||
xxsel 35,47,48,35
|
||||
xvabssp 1,0
|
||||
lxv 0,-176(9)
|
||||
xvcmpgtsp 60,9,32
|
||||
xvabssp 6,0
|
||||
lxv 0,-160(9)
|
||||
xxsel 9,32,9,60
|
||||
xxsel 60,49,50,60
|
||||
xvabssp 13,0
|
||||
lxv 0,-144(9)
|
||||
xvcmpgtsp 42,9,5
|
||||
xvcmpgtsp 37,6,1
|
||||
xvabssp 11,0
|
||||
lxv 0,-128(9)
|
||||
xxsel 9,5,9,42
|
||||
xxsel 42,35,60,42
|
||||
xxsel 6,1,6,37
|
||||
xxsel 37,47,48,37
|
||||
xvabssp 2,0
|
||||
lxv 0,-112(9)
|
||||
xvcmpgtsp 36,11,13
|
||||
xvabssp 7,0
|
||||
lxv 0,-96(9)
|
||||
xxsel 11,13,11,36
|
||||
xxsel 36,49,50,36
|
||||
xvabssp 3,0
|
||||
lxv 0,-80(9)
|
||||
xvcmpgtsp 45,11,6
|
||||
xvcmpgtsp 39,7,2
|
||||
xvabssp 10,0
|
||||
lxv 0,-64(9)
|
||||
xxsel 7,2,7,39
|
||||
xxsel 39,47,48,39
|
||||
xvabssp 4,0
|
||||
lxv 0,-48(9)
|
||||
xvcmpgtsp 38,10,3
|
||||
xvabssp 8,0
|
||||
lxv 0,-32(9)
|
||||
xxsel 10,3,10,38
|
||||
xxsel 38,49,50,38
|
||||
xvabssp 0,0
|
||||
xvcmpgtsp 43,10,7
|
||||
xvcmpgtsp 41,8,4
|
||||
xvcmpgtsp 40,12,0
|
||||
xxsel 8,4,8,41
|
||||
xxsel 41,47,48,41
|
||||
xxsel 0,0,12,40
|
||||
xxsel 12,6,11,45
|
||||
xxsel 11,7,10,43
|
||||
xxsel 45,37,36,45
|
||||
xvcmpgtsp 33,0,8
|
||||
xvcmpgtsp 32,12,9
|
||||
vadduwm 13,13,31
|
||||
xxsel 40,49,50,40
|
||||
xxsel 43,39,38,43
|
||||
xxsel 0,8,0,33
|
||||
xxsel 12,9,12,32
|
||||
xxsel 33,41,40,33
|
||||
xxsel 32,42,45,32
|
||||
xvcmpgtsp 44,0,11
|
||||
vadduwm 1,1,31
|
||||
vadduwm 0,19,0
|
||||
vadduwm 19,19,30
|
||||
xxsel 0,11,0,44
|
||||
xxsel 33,43,33,44
|
||||
xvcmpgtsp 45,0,12
|
||||
vadduwm 1,27,1
|
||||
xxsel 0,12,0,45
|
||||
xxsel 32,32,33,45
|
||||
xvcmpgtsp 33,0,34
|
||||
xxsel 46,46,32,33
|
||||
xxsel 34,34,0,33
|
||||
bgt 7,.L5
|
||||
xxsldwi 12,34,34,3
|
||||
xxsldwi 11,34,34,2
|
||||
li 9,0
|
||||
li 8,12
|
||||
xxsldwi 0,34,34,1
|
||||
xscvspdp 34,34
|
||||
vextuwrx 3,9,14
|
||||
li 9,4
|
||||
xscvspdp 12,12
|
||||
xscvspdp 11,11
|
||||
xscvspdp 0,0
|
||||
vextuwrx 6,9,14
|
||||
li 9,8
|
||||
vextuwrx 7,9,14
|
||||
vextuwrx 9,8,14
|
||||
rldicl 12,6,0,32
|
||||
rldicl 8,3,0,32
|
||||
rldicl 0,7,0,32
|
||||
rldicl 5,9,0,32
|
||||
fcmpu 7,12,11
|
||||
fmr 10,0
|
||||
beq 7,.L71
|
||||
bnl 7,.L8
|
||||
mr 8,12
|
||||
fmr 12,11
|
||||
.L8:
|
||||
xscmpudp 7,0,34
|
||||
bne 7,.L11
|
||||
cmplw 7,7,9
|
||||
ble 7,.L12
|
||||
mr 7,9
|
||||
.L12:
|
||||
rldicl 5,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,12,10
|
||||
beq 7,.L72
|
||||
bnl 7,.L17
|
||||
mr 8,5
|
||||
fmr 12,10
|
||||
.L17:
|
||||
cmpd 7,11,10
|
||||
ble 7,.L16
|
||||
addi 7,10,1
|
||||
sldi 9,10,2
|
||||
subf 6,10,11
|
||||
cmpd 7,7,11
|
||||
mtctr 6
|
||||
add 4,4,9
|
||||
bgt 7,.L53
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L53
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,0(4)
|
||||
addi 4,4,4
|
||||
fabs 0,0
|
||||
fcmpu 7,0,12
|
||||
bng 7,.L19
|
||||
fmr 12,0
|
||||
mr 8,10
|
||||
.L19:
|
||||
addi 10,10,1
|
||||
bdnz .L21
|
||||
.L16:
|
||||
lxv 59,-80(1)
|
||||
lxv 60,-64(1)
|
||||
addi 3,8,1
|
||||
lxv 61,-48(1)
|
||||
lxv 62,-32(1)
|
||||
lxv 63,-16(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L71:
|
||||
cmplw 7,3,6
|
||||
ble 7,.L7
|
||||
mr 3,6
|
||||
.L7:
|
||||
rldicl 8,3,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
.L40:
|
||||
sldi 6,5,2
|
||||
li 8,0
|
||||
li 9,0
|
||||
xxlxor 0,0,0
|
||||
b .L22
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
blt 7,.L39
|
||||
mr 5,0
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L72:
|
||||
cmpd 7,8,5
|
||||
ble 7,.L17
|
||||
mr 8,5
|
||||
b .L17
|
||||
.p2align 4,,15
|
||||
.L39:
|
||||
xscpsgndp 10,34,34
|
||||
b .L13
|
||||
.L53:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.L54:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L35
|
||||
.L60:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L61
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size isamax_k,.-isamax_k
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC3:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC4:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC5:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
420
kernel/power/isamin_power8.S
Normal file
420
kernel/power/isamin_power8.S
Normal file
@@ -0,0 +1,420 @@
|
||||
/* .file "isamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl isamin_k
|
||||
.type isamin_k, @function
|
||||
*/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry isamin_k,.-isamin_k
|
||||
#endif
|
||||
mr. 11,3
|
||||
ble 0,.L36
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
lfs 0,0(4)
|
||||
li 0,-48
|
||||
cmpdi 7,5,1
|
||||
stvx 30,1,0
|
||||
li 0,-32
|
||||
stvx 31,1,0
|
||||
fabs 0,0
|
||||
beq 7,.L62
|
||||
rldicr. 6,11,0,61
|
||||
beq 0,.L40
|
||||
sldi 0,5,1
|
||||
sldi 12,5,2
|
||||
std 31,-8(1)
|
||||
add 0,0,5
|
||||
neg 31,5
|
||||
sldi 3,5,4
|
||||
sldi 0,0,2
|
||||
add 7,4,12
|
||||
sldi 31,31,2
|
||||
sldi 5,5,3
|
||||
li 9,0
|
||||
li 10,0
|
||||
b .L24
|
||||
.p2align 4,,15
|
||||
.L41:
|
||||
mr 10,9
|
||||
.L25:
|
||||
fmr 0,12
|
||||
add 7,7,3
|
||||
.L24:
|
||||
lfs 12,0(7)
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L26
|
||||
fmr 0,12
|
||||
addi 10,9,1
|
||||
.L26:
|
||||
add 8,31,7
|
||||
lfsx 12,8,5
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L28
|
||||
fmr 0,12
|
||||
addi 10,9,2
|
||||
.L28:
|
||||
lfsx 12,8,0
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L30
|
||||
fmr 0,12
|
||||
addi 10,9,3
|
||||
.L30:
|
||||
addi 9,9,4
|
||||
cmpd 7,6,9
|
||||
ble 7,.L63
|
||||
lfsx 12,8,3
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
blt 7,.L41
|
||||
fmr 12,0
|
||||
b .L25
|
||||
.p2align 4,,15
|
||||
.L36:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L63:
|
||||
addi 6,6,-1
|
||||
ld 31,-8(1)
|
||||
srdi 6,6,2
|
||||
addi 6,6,1
|
||||
sldi 9,6,2
|
||||
mulld 6,12,6
|
||||
cmpd 7,11,9
|
||||
ble 7,.L33
|
||||
.L23:
|
||||
addi 8,9,1
|
||||
sldi 6,6,2
|
||||
cmpd 7,8,11
|
||||
subf 8,9,11
|
||||
mtctr 8
|
||||
add 4,4,6
|
||||
bgt 7,.L52
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L52
|
||||
.p2align 4,,15
|
||||
.L35:
|
||||
lfs 12,0(4)
|
||||
add 4,4,12
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L34
|
||||
fmr 0,12
|
||||
mr 10,9
|
||||
.L34:
|
||||
addi 9,9,1
|
||||
bdnz .L35
|
||||
.L33:
|
||||
li 0,-48
|
||||
addi 3,10,1
|
||||
lvx 30,1,0
|
||||
li 0,-32
|
||||
lvx 31,1,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L62:
|
||||
rldicr. 8,11,0,57
|
||||
li 10,0
|
||||
bne 0,.L64
|
||||
.L4:
|
||||
addi 7,8,1
|
||||
sldi 9,8,2
|
||||
cmpd 7,7,11
|
||||
add 4,4,9
|
||||
subf 9,8,11
|
||||
mtctr 9
|
||||
bgt 7,.L51
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L51
|
||||
.p2align 4,,15
|
||||
.L22:
|
||||
lfs 12,0(4)
|
||||
addi 4,4,4
|
||||
fabs 12,12
|
||||
fcmpu 7,0,12
|
||||
bng 7,.L21
|
||||
fmr 0,12
|
||||
mr 10,8
|
||||
.L21:
|
||||
addi 8,8,1
|
||||
bdnz .L22
|
||||
li 0,-48
|
||||
addi 3,10,1
|
||||
lvx 30,1,0
|
||||
li 0,-32
|
||||
lvx 31,1,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L64:
|
||||
lxvd2x 4,0,4
|
||||
addis 10,2,.LC2@toc@ha
|
||||
addis 5,2,.LC3@toc@ha
|
||||
std 31,-8(1)
|
||||
vspltisw 2,0
|
||||
addi 10,10,.LC2@toc@l
|
||||
addis 7,2,.LC4@toc@ha
|
||||
addis 9,2,.LC5@toc@ha
|
||||
addis 6,2,.LC6@toc@ha
|
||||
lxvd2x 51,0,10
|
||||
addis 10,2,.LC7@toc@ha
|
||||
addi 7,7,.LC4@toc@l
|
||||
addi 9,9,.LC5@toc@l
|
||||
addi 5,5,.LC3@toc@l
|
||||
xvabssp 4,4
|
||||
addi 6,6,.LC6@toc@l
|
||||
addi 10,10,.LC7@toc@l
|
||||
lxvd2x 36,0,7
|
||||
vspltisw 18,8
|
||||
lxvd2x 37,0,9
|
||||
lxvd2x 35,0,5
|
||||
mr 9,4
|
||||
li 7,0
|
||||
lxvd2x 48,0,6
|
||||
lxvd2x 49,0,10
|
||||
vadduwm 18,18,18
|
||||
xxlor 38,51,51
|
||||
xxlor 40,4,4
|
||||
b .L6
|
||||
.p2align 4,,15
|
||||
.L65:
|
||||
lxvd2x 5,0,9
|
||||
xvabssp 40,5
|
||||
.L6:
|
||||
addi 5,9,16
|
||||
addi 6,9,32
|
||||
vadduwm 14,2,16
|
||||
addi 10,9,64
|
||||
addi 12,9,48
|
||||
addi 31,9,80
|
||||
addi 3,9,96
|
||||
lxvd2x 5,0,5
|
||||
lxvd2x 42,0,6
|
||||
addi 5,9,112
|
||||
addi 6,9,128
|
||||
lxvd2x 44,0,10
|
||||
lxvd2x 9,0,12
|
||||
addi 10,9,160
|
||||
addi 12,9,144
|
||||
lxvd2x 6,0,31
|
||||
lxvd2x 1,0,3
|
||||
addi 31,9,176
|
||||
addi 3,9,192
|
||||
lxvd2x 11,0,5
|
||||
lxvd2x 13,0,6
|
||||
addi 5,9,208
|
||||
addi 6,9,224
|
||||
lxvd2x 2,0,10
|
||||
lxvd2x 7,0,12
|
||||
addi 10,9,240
|
||||
lxvd2x 10,0,31
|
||||
lxvd2x 3,0,3
|
||||
xvabssp 42,42
|
||||
xvabssp 5,5
|
||||
addi 7,7,64
|
||||
lxvd2x 8,0,5
|
||||
lxvd2x 0,0,6
|
||||
xvabssp 44,44
|
||||
xvabssp 9,9
|
||||
cmpd 7,8,7
|
||||
addi 9,9,256
|
||||
lxvd2x 12,0,10
|
||||
xvabssp 6,6
|
||||
xvabssp 1,1
|
||||
xvabssp 11,11
|
||||
xvabssp 13,13
|
||||
xvabssp 7,7
|
||||
xvabssp 2,2
|
||||
xvabssp 10,10
|
||||
xvabssp 3,3
|
||||
xvabssp 8,8
|
||||
xvabssp 0,0
|
||||
xvabssp 12,12
|
||||
xvcmpgtsp 32,40,5
|
||||
xvcmpgtsp 62,42,9
|
||||
xvcmpgtsp 45,44,6
|
||||
xvcmpgtsp 63,1,11
|
||||
xvcmpgtsp 39,13,7
|
||||
xvcmpgtsp 47,2,10
|
||||
xvcmpgtsp 41,3,8
|
||||
xvcmpgtsp 33,0,12
|
||||
xxsel 5,40,5,32
|
||||
xxsel 32,38,35,32
|
||||
xxsel 9,42,9,62
|
||||
xxsel 6,44,6,45
|
||||
xxsel 11,1,11,63
|
||||
xxsel 44,38,35,45
|
||||
xxsel 7,13,7,39
|
||||
xvcmpgtsp 42,5,9
|
||||
xxsel 10,2,10,47
|
||||
xvcmpgtsp 45,6,11
|
||||
xxsel 8,3,8,41
|
||||
xxsel 63,36,37,63
|
||||
xxsel 0,0,12,33
|
||||
xvcmpgtsp 43,7,10
|
||||
xxsel 40,36,37,33
|
||||
xxsel 62,36,37,62
|
||||
xvcmpgtsp 33,8,0
|
||||
xxsel 41,38,35,41
|
||||
xxsel 39,38,35,39
|
||||
xxsel 47,36,37,47
|
||||
xxsel 9,5,9,42
|
||||
xxsel 42,32,62,42
|
||||
xxsel 12,6,11,45
|
||||
xxsel 45,44,63,45
|
||||
xxsel 11,7,10,43
|
||||
xvcmpgtsp 32,9,12
|
||||
vadduwm 13,13,18
|
||||
xxsel 43,39,47,43
|
||||
xxsel 0,8,0,33
|
||||
xxsel 33,41,40,33
|
||||
xvcmpgtsp 44,11,0
|
||||
vadduwm 1,1,18
|
||||
xxsel 12,9,12,32
|
||||
xxsel 32,42,45,32
|
||||
vadduwm 0,2,0
|
||||
vadduwm 2,2,17
|
||||
xxsel 0,11,0,44
|
||||
xxsel 33,43,33,44
|
||||
xvcmpgtsp 45,12,0
|
||||
vadduwm 1,14,1
|
||||
xxsel 0,12,0,45
|
||||
xxsel 32,32,33,45
|
||||
xvcmpgtsp 33,4,0
|
||||
xxsel 51,51,32,33
|
||||
xxsel 4,4,0,33
|
||||
bgt 7,.L65
|
||||
xxsldwi 0,4,4,1
|
||||
xscvspdp 10,4
|
||||
vspltw 0,19,0
|
||||
xxsldwi 12,4,4,3
|
||||
xscvspdp 0,0
|
||||
mfvsrwz 3,51
|
||||
mfvsrwz 6,32
|
||||
vspltw 0,19,3
|
||||
xscvspdp 12,12
|
||||
xxsldwi 4,4,4,2
|
||||
mfvsrwz 7,32
|
||||
vspltw 0,19,2
|
||||
xscvspdp 4,4
|
||||
mfvsrwz 9,32
|
||||
fcmpu 7,0,10
|
||||
rldicl 10,3,0,32
|
||||
rldicl 31,6,0,32
|
||||
fmr 11,12
|
||||
rldicl 5,7,0,32
|
||||
rldicl 0,9,0,32
|
||||
beq 7,.L66
|
||||
bng 7,.L9
|
||||
fmr 0,10
|
||||
mr 10,31
|
||||
.L9:
|
||||
fcmpu 7,12,4
|
||||
bne 7,.L12
|
||||
cmplw 7,7,9
|
||||
ble 7,.L13
|
||||
mr 7,9
|
||||
.L13:
|
||||
rldicl 5,7,0,32
|
||||
.L14:
|
||||
fcmpu 7,0,11
|
||||
beq 7,.L67
|
||||
bng 7,.L19
|
||||
fmr 0,11
|
||||
mr 10,5
|
||||
.L19:
|
||||
cmpd 7,11,8
|
||||
ld 31,-8(1)
|
||||
bgt 7,.L4
|
||||
b .L33
|
||||
.p2align 4,,15
|
||||
.L66:
|
||||
cmplw 7,3,6
|
||||
ble 7,.L8
|
||||
mr 3,6
|
||||
.L8:
|
||||
rldicl 10,3,0,32
|
||||
b .L9
|
||||
.p2align 4,,15
|
||||
.L40:
|
||||
sldi 12,5,2
|
||||
li 10,0
|
||||
li 9,0
|
||||
b .L23
|
||||
.p2align 4,,15
|
||||
.L12:
|
||||
bng 7,.L14
|
||||
fmr 11,4
|
||||
mr 5,0
|
||||
b .L14
|
||||
.p2align 4,,15
|
||||
.L67:
|
||||
cmpd 7,10,5
|
||||
ble 7,.L19
|
||||
mr 10,5
|
||||
b .L19
|
||||
.L51:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L22
|
||||
.L52:
|
||||
li 8,1
|
||||
mtctr 8
|
||||
b .L35
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size isamin_k,.-isamin_k
|
||||
#endif
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC3:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC4:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC5:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.LC6:
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.long 32
|
||||
.LC7:
|
||||
.long 64
|
||||
.long 64
|
||||
.long 64
|
||||
.long 64
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
382
kernel/power/isamin_power9.S
Normal file
382
kernel/power/isamin_power9.S
Normal file
@@ -0,0 +1,382 @@
|
||||
.file "isamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl isamin_k
|
||||
.type isamin_k, @function
|
||||
isamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry isamin_k,.-isamin_k
|
||||
mr. 11,3
|
||||
ble 0,.L36
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
lfs 0,0(4)
|
||||
cmpdi 7,5,1
|
||||
stxv 61,-64(1)
|
||||
stxv 62,-48(1)
|
||||
stxv 63,-32(1)
|
||||
fabs 0,0
|
||||
beq 7,.L62
|
||||
rldicr. 6,11,0,61
|
||||
beq 0,.L40
|
||||
sldi 8,5,1
|
||||
sldi 0,5,2
|
||||
neg 12,5
|
||||
std 31,-8(1)
|
||||
sldi 3,5,4
|
||||
sldi 31,5,3
|
||||
li 9,0
|
||||
li 10,0
|
||||
add 5,8,5
|
||||
add 7,4,0
|
||||
sldi 12,12,2
|
||||
sldi 5,5,2
|
||||
b .L24
|
||||
.p2align 4,,15
|
||||
.L41:
|
||||
mr 10,9
|
||||
.L25:
|
||||
add 7,7,3
|
||||
fmr 0,12
|
||||
.L24:
|
||||
lfs 12,0(7)
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L26
|
||||
fmr 0,12
|
||||
addi 10,9,1
|
||||
.L26:
|
||||
add 8,7,12
|
||||
lfsx 12,8,31
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L28
|
||||
fmr 0,12
|
||||
addi 10,9,2
|
||||
.L28:
|
||||
lfsx 12,8,5
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L30
|
||||
fmr 0,12
|
||||
addi 10,9,3
|
||||
.L30:
|
||||
addi 9,9,4
|
||||
cmpd 7,6,9
|
||||
ble 7,.L63
|
||||
lfsx 12,8,3
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
blt 7,.L41
|
||||
fmr 12,0
|
||||
b .L25
|
||||
.p2align 4,,15
|
||||
.L36:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L63:
|
||||
addi 6,6,-1
|
||||
ld 31,-8(1)
|
||||
srdi 6,6,2
|
||||
addi 6,6,1
|
||||
sldi 9,6,2
|
||||
mulld 6,0,6
|
||||
cmpd 7,11,9
|
||||
ble 7,.L33
|
||||
.L23:
|
||||
addi 8,9,1
|
||||
sldi 6,6,2
|
||||
subf 7,9,11
|
||||
cmpd 7,8,11
|
||||
mtctr 7
|
||||
add 4,4,6
|
||||
bgt 7,.L52
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L52
|
||||
.p2align 4,,15
|
||||
.L35:
|
||||
lfs 12,0(4)
|
||||
add 4,4,0
|
||||
fabs 12,12
|
||||
fcmpu 7,12,0
|
||||
bnl 7,.L34
|
||||
fmr 0,12
|
||||
mr 10,9
|
||||
.L34:
|
||||
addi 9,9,1
|
||||
bdnz .L35
|
||||
.L33:
|
||||
lxv 61,-64(1)
|
||||
lxv 62,-48(1)
|
||||
addi 3,10,1
|
||||
lxv 63,-32(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L62:
|
||||
rldicr. 8,11,0,57
|
||||
li 10,0
|
||||
bne 0,.L64
|
||||
.L4:
|
||||
addi 7,8,1
|
||||
sldi 9,8,2
|
||||
subf 6,8,11
|
||||
cmpd 7,7,11
|
||||
mtctr 6
|
||||
add 4,4,9
|
||||
bgt 7,.L51
|
||||
li 3,-1
|
||||
rldicr 3,3,0,0
|
||||
cmpd 7,11,3
|
||||
beq 7,.L51
|
||||
.p2align 4,,15
|
||||
.L22:
|
||||
lfs 12,0(4)
|
||||
addi 4,4,4
|
||||
fabs 12,12
|
||||
fcmpu 7,0,12
|
||||
bng 7,.L21
|
||||
fmr 0,12
|
||||
mr 10,8
|
||||
.L21:
|
||||
addi 8,8,1
|
||||
bdnz .L22
|
||||
lxv 61,-64(1)
|
||||
lxv 62,-48(1)
|
||||
addi 3,10,1
|
||||
lxv 63,-32(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L64:
|
||||
lxv 0,0(4)
|
||||
xxspltib 47,16
|
||||
addis 6,2,.LC2@toc@ha
|
||||
addis 7,2,.LC3@toc@ha
|
||||
addis 10,2,.LC4@toc@ha
|
||||
addis 9,2,.LC5@toc@ha
|
||||
xxspltib 63,32
|
||||
xxspltib 46,64
|
||||
addi 6,6,.LC2@toc@l
|
||||
addi 10,10,.LC4@toc@l
|
||||
addi 7,7,.LC3@toc@l
|
||||
std 31,-8(1)
|
||||
addi 9,9,.LC5@toc@l
|
||||
xxspltib 50,0
|
||||
vextsb2w 15,15
|
||||
lxv 48,0(6)
|
||||
lxv 51,0(10)
|
||||
vextsb2w 31,31
|
||||
vextsb2w 14,14
|
||||
xvabssp 4,0
|
||||
lxv 34,0(9)
|
||||
lxv 49,0(7)
|
||||
mr 9,4
|
||||
li 10,0
|
||||
xxlor 35,48,48
|
||||
xxlor 40,4,4
|
||||
b .L6
|
||||
.p2align 4,,15
|
||||
.L65:
|
||||
lxv 0,0(9)
|
||||
xvabssp 40,0
|
||||
.L6:
|
||||
lxv 0,16(9)
|
||||
vadduwm 29,18,31
|
||||
lxv 12,240(9)
|
||||
addi 10,10,64
|
||||
addi 9,9,256
|
||||
cmpd 7,8,10
|
||||
xvabssp 5,0
|
||||
lxv 0,-224(9)
|
||||
xvabssp 12,12
|
||||
xvabssp 32,0
|
||||
lxv 0,-208(9)
|
||||
xvcmpgtsp 42,40,5
|
||||
xvabssp 9,0
|
||||
lxv 0,-192(9)
|
||||
xxsel 5,40,5,42
|
||||
xvabssp 44,0
|
||||
lxv 0,-176(9)
|
||||
xvcmpgtsp 62,32,9
|
||||
xvabssp 6,0
|
||||
lxv 0,-160(9)
|
||||
xxsel 9,32,9,62
|
||||
xxsel 32,35,49,42
|
||||
xvabssp 1,0
|
||||
lxv 0,-144(9)
|
||||
xxsel 62,51,34,62
|
||||
xvcmpgtsp 42,5,9
|
||||
xvcmpgtsp 37,44,6
|
||||
xvabssp 11,0
|
||||
lxv 0,-128(9)
|
||||
xxsel 9,5,9,42
|
||||
xxsel 42,32,62,42
|
||||
xxsel 6,44,6,37
|
||||
xxsel 37,35,49,37
|
||||
xvabssp 13,0
|
||||
lxv 0,-112(9)
|
||||
xvcmpgtsp 36,1,11
|
||||
xvabssp 7,0
|
||||
lxv 0,-96(9)
|
||||
xxsel 11,1,11,36
|
||||
xxsel 36,51,34,36
|
||||
xvabssp 2,0
|
||||
lxv 0,-80(9)
|
||||
xvcmpgtsp 45,6,11
|
||||
xvcmpgtsp 39,13,7
|
||||
xvabssp 10,0
|
||||
lxv 0,-64(9)
|
||||
xxsel 7,13,7,39
|
||||
xxsel 39,35,49,39
|
||||
xvabssp 3,0
|
||||
lxv 0,-48(9)
|
||||
xvcmpgtsp 38,2,10
|
||||
xvabssp 8,0
|
||||
lxv 0,-32(9)
|
||||
xxsel 10,2,10,38
|
||||
xxsel 38,51,34,38
|
||||
xvabssp 0,0
|
||||
xvcmpgtsp 43,7,10
|
||||
xvcmpgtsp 41,3,8
|
||||
xvcmpgtsp 33,0,12
|
||||
xxsel 8,3,8,41
|
||||
xxsel 41,35,49,41
|
||||
xxsel 0,0,12,33
|
||||
xxsel 40,51,34,33
|
||||
xxsel 12,6,11,45
|
||||
xxsel 11,7,10,43
|
||||
xvcmpgtsp 33,8,0
|
||||
xxsel 45,37,36,45
|
||||
xvcmpgtsp 32,9,12
|
||||
xxsel 43,39,38,43
|
||||
vadduwm 13,13,15
|
||||
xxsel 0,8,0,33
|
||||
xxsel 33,41,40,33
|
||||
xxsel 12,9,12,32
|
||||
xxsel 32,42,45,32
|
||||
xvcmpgtsp 44,11,0
|
||||
vadduwm 1,1,15
|
||||
vadduwm 0,18,0
|
||||
vadduwm 18,18,14
|
||||
xxsel 0,11,0,44
|
||||
xxsel 33,43,33,44
|
||||
xvcmpgtsp 45,12,0
|
||||
vadduwm 1,29,1
|
||||
xxsel 0,12,0,45
|
||||
xxsel 32,32,33,45
|
||||
xvcmpgtsp 33,4,0
|
||||
xxsel 48,48,32,33
|
||||
xxsel 4,4,0,33
|
||||
bgt 7,.L65
|
||||
xxsldwi 0,4,4,3
|
||||
xxsldwi 11,4,4,2
|
||||
li 9,0
|
||||
li 10,12
|
||||
xxsldwi 12,4,4,1
|
||||
xscvspdp 4,4
|
||||
vextuwrx 3,9,16
|
||||
li 9,4
|
||||
xscvspdp 0,0
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
vextuwrx 6,9,16
|
||||
li 9,8
|
||||
vextuwrx 7,9,16
|
||||
vextuwrx 9,10,16
|
||||
rldicl 31,6,0,32
|
||||
rldicl 10,3,0,32
|
||||
rldicl 5,7,0,32
|
||||
rldicl 0,9,0,32
|
||||
fcmpu 7,0,11
|
||||
fmr 10,12
|
||||
beq 7,.L66
|
||||
bng 7,.L9
|
||||
mr 10,31
|
||||
fmr 0,11
|
||||
.L9:
|
||||
fcmpu 7,12,4
|
||||
bne 7,.L12
|
||||
cmplw 7,7,9
|
||||
ble 7,.L13
|
||||
mr 7,9
|
||||
.L13:
|
||||
rldicl 5,7,0,32
|
||||
.L14:
|
||||
fcmpu 7,0,10
|
||||
beq 7,.L67
|
||||
bng 7,.L19
|
||||
mr 10,5
|
||||
fmr 0,10
|
||||
.L19:
|
||||
cmpd 7,11,8
|
||||
ld 31,-8(1)
|
||||
bgt 7,.L4
|
||||
b .L33
|
||||
.p2align 4,,15
|
||||
.L66:
|
||||
cmplw 7,3,6
|
||||
ble 7,.L8
|
||||
mr 3,6
|
||||
.L8:
|
||||
rldicl 10,3,0,32
|
||||
b .L9
|
||||
.p2align 4,,15
|
||||
.L40:
|
||||
sldi 0,5,2
|
||||
li 10,0
|
||||
li 9,0
|
||||
b .L23
|
||||
.p2align 4,,15
|
||||
.L12:
|
||||
bng 7,.L14
|
||||
mr 5,0
|
||||
fmr 10,4
|
||||
b .L14
|
||||
.p2align 4,,15
|
||||
.L67:
|
||||
cmpd 7,10,5
|
||||
ble 7,.L19
|
||||
mr 10,5
|
||||
b .L19
|
||||
.L51:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L22
|
||||
.L52:
|
||||
li 8,1
|
||||
mtctr 8
|
||||
b .L35
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
.size isamin_k,.-isamin_k
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.long 0
|
||||
.long 1
|
||||
.long 2
|
||||
.long 3
|
||||
.LC3:
|
||||
.long 4
|
||||
.long 5
|
||||
.long 6
|
||||
.long 7
|
||||
.LC4:
|
||||
.long 8
|
||||
.long 9
|
||||
.long 10
|
||||
.long 11
|
||||
.LC5:
|
||||
.long 12
|
||||
.long 13
|
||||
.long 14
|
||||
.long 15
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
@@ -316,6 +316,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
|
||||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
@@ -323,6 +324,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
@@ -314,14 +314,16 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
|
||||
if (inc_x == 1) {
|
||||
minf = CABS1(x,0); //index will not be incremented
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = ziamin_kernel_16_TUNED(n1, x, &minf);
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
@@ -136,8 +136,8 @@ LSGEMM_L8x16_BEGIN:
|
||||
#endif
|
||||
|
||||
ZERO8x16
|
||||
mtctr L
|
||||
ble LSGEMM_L8x16_SUB0
|
||||
mtctr L
|
||||
bl LSGEMM_L8x16_LMAIN_SUB
|
||||
andi. L, T12, 127
|
||||
ble LSGEMM_L8x16_SAVE
|
||||
@@ -146,7 +146,7 @@ LSGEMM_L8x16_BEGIN:
|
||||
LSGEMM_L8x16_SUB0:
|
||||
#if defined(TRMMKERNEL)
|
||||
andi. L, T11, 255
|
||||
cmpwi T11,128
|
||||
cmpwi T11,129
|
||||
#else
|
||||
andi. L, K, 255
|
||||
cmpwi K,129
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
include $(KERNELDIR)/KERNEL.HASWELL
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
|
||||
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
|
||||
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
|
||||
#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
|
||||
DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c
|
||||
|
||||
#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
|
||||
#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
|
||||
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
|
||||
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
|
||||
|
||||
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||
DGEMM_BETA = dgemm_beta_skylakex.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c
|
||||
ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
|
||||
|
||||
352
kernel/x86_64/cgemm_kernel_8x2_skylakex.c
Normal file
352
kernel/x86_64/cgemm_kernel_8x2_skylakex.c
Normal file
@@ -0,0 +1,352 @@
|
||||
#include <stdint.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define CGEMM_SKX_MODE 0 //not to do conjugation on a_block and b_block
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define CGEMM_SKX_MODE 1 //do conjugation on a_block, not b_block
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define CGEMM_SKX_MODE 2 //do conjugation on b_block, not a_block
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#define CGEMM_SKX_MODE 3 //do conjugation on a_block and b_block
|
||||
#endif
|
||||
|
||||
// recommended settings: GEMM_DEFAULT_Q = 192, GEMM_DEFAULT_P = 384
|
||||
/* %0=a_pointer, %1=b_pointer, %2=c_pointer, %3=c_store, %4=ldc(bytes), %5=&constval, %6 = k_counter, %7 = m_counter, %8 = b_pref */
|
||||
// const float constval[4] = {alpha_r, alpha_i, -1, 1};
|
||||
/* r11 = m; r12 = k * 16; r13 = k; r14 = b_head; r15 = %1 + r12 * 3; */
|
||||
#define GENERAL_INIT "movq %7,%%r11; movq %1,%%r14; movq %6,%%r13; movq %6,%%r12; salq $4,%%r12;"
|
||||
#define GENERAL_RECOVER "movq %%r11,%7; movq %%r13,%6; movq %%r14,%1;"
|
||||
#define CONSTZMM_INIT "vbroadcastss (%5),%%zmm0; vbroadcastss 4(%5),%%zmm1; vbroadcastsd 8(%5),%%zmm2;"
|
||||
#define COMPUTE_INIT "movq %%r13,%6; movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"
|
||||
|
||||
/* m=8, zmm0=alpha_r, zmm1=alpha_i, zmm2={-1,1,...,-1,1}, zmm3-zmm7 for temporary use, zmm8-zmm31 for accumulators */
|
||||
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
|
||||
#define unit_kernel_k1m8n1(a_r,a_i,b_off,c_le,c_ri,...) \
|
||||
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231ps "#a_r",%%zmm3,"#c_le"; vfmadd231ps "#a_i",%%zmm3,"#c_ri";"
|
||||
#else //do conjugation on a_block
|
||||
#define unit_kernel_k1m8n1(a_r,a_i,b_off,c_le,c_ri,...) \
|
||||
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231ps "#a_r",%%zmm3,"#c_le"; vfnmadd231ps "#a_i",%%zmm3,"#c_ri";"
|
||||
#endif
|
||||
#define KERNEL_h_k1m8n1 \
|
||||
"vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
|
||||
unit_kernel_k1m8n1(%%zmm4,%%zmm5,0,%%zmm8,%%zmm9,%1)
|
||||
#define KERNEL_t_k1m8n1 KERNEL_h_k1m8n1 "addq $8,%1;"
|
||||
#define KERNEL_h_k1m8n2 KERNEL_h_k1m8n1 unit_kernel_k1m8n1(%%zmm4,%%zmm5,8,%%zmm10,%%zmm11,%1)
|
||||
#define KERNEL_t_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;"
|
||||
#define unit_kernel_k1m8n2(c1le,c1ri,c2le,c2ri,...) \
|
||||
unit_kernel_k1m8n1(%%zmm4,%%zmm5,0,c1le,c1ri,__VA_ARGS__)\
|
||||
unit_kernel_k1m8n1(%%zmm4,%%zmm5,8,c2le,c2ri,__VA_ARGS__)
|
||||
#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_kernel_k1m8n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
|
||||
#define KERNEL_t_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
|
||||
#define KERNEL_t_k1m8n6 KERNEL_h_k1m8n4 unit_kernel_k1m8n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) "addq $16,%1;"
|
||||
#define KERNEL_h_k1m8n8 KERNEL_t_k1m8n6 unit_kernel_k1m8n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15)
|
||||
#define KERNEL_t_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_kernel_k1m8n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1)
|
||||
#define KERNEL_t_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_kernel_k1m8n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
|
||||
#define KERNEL_t_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
|
||||
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 1 //not to do conjugation on b_block
|
||||
#define unit_save_m8n1(c_le,c_ri,...) \
|
||||
"vpermilps $177,"#c_ri","#c_ri"; vfmadd231ps "#c_ri",%%zmm2,"#c_le"; vpermilps $177,"#c_le",%%zmm4;"\
|
||||
"vfmaddsub213ps ("#__VA_ARGS__"),%%zmm1,%%zmm4; vfmaddsub213ps %%zmm4,%%zmm0,"#c_le"; vmovups "#c_le",("#__VA_ARGS__");"
|
||||
#else //do conjugation on b_block
|
||||
#define unit_save_m8n1(c_le,c_ri,...) \
|
||||
"vpermilps $177,"#c_ri","#c_ri"; vfnmadd231ps "#c_ri",%%zmm2,"#c_le"; vpermilps $177,"#c_le",%%zmm4;"\
|
||||
"vfmsubadd213ps ("#__VA_ARGS__"),%%zmm0,"#c_le"; vfmsubadd231ps %%zmm4,%%zmm1,"#c_le"; vmovups "#c_le",("#__VA_ARGS__");"
|
||||
#endif
|
||||
#define SAVE_SETUP_m8 "movq %2,%3; addq $64,%2;"
|
||||
#define SAVE_m8n1 SAVE_SETUP_m8 unit_save_m8n1(%%zmm8,%%zmm9,%3)
|
||||
#define SAVE_m8n2 SAVE_m8n1 unit_save_m8n1(%%zmm10,%%zmm11,%3,%4,1)
|
||||
#define unit_save_m8n2(c1le,c1ri,c2le,c2ri) \
|
||||
"leaq (%3,%4,2),%3;" unit_save_m8n1(c1le,c1ri,%3) unit_save_m8n1(c2le,c2ri,%3,%4,1)
|
||||
#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
|
||||
#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
|
||||
#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
|
||||
#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
|
||||
#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31)
|
||||
/* Accumulator clearing for the m=8 tile. unit_init_m8n1 zeroes the two zmm
 * accumulators of one column with vpxorq; INIT_m8nN chains the smaller sizes so that
 * zmm8..zmm(7+2N) are zero before the k loop starts. */
#define unit_init_m8n1(c_le,c_ri) "vpxorq "#c_le","#c_le","#c_le"; vpxorq "#c_ri","#c_ri","#c_ri";"
|
||||
#define INIT_m8n1 unit_init_m8n1(%%zmm8,%%zmm9)
|
||||
#define INIT_m8n2 INIT_m8n1 unit_init_m8n1(%%zmm10,%%zmm11)
|
||||
#define INIT_m8n4 INIT_m8n2 unit_init_m8n1(%%zmm12,%%zmm13) unit_init_m8n1(%%zmm14,%%zmm15)
|
||||
#define INIT_m8n6 INIT_m8n4 unit_init_m8n1(%%zmm16,%%zmm17) unit_init_m8n1(%%zmm18,%%zmm19)
|
||||
#define INIT_m8n8 INIT_m8n6 unit_init_m8n1(%%zmm20,%%zmm21) unit_init_m8n1(%%zmm22,%%zmm23)
|
||||
#define INIT_m8n10 INIT_m8n8 unit_init_m8n1(%%zmm24,%%zmm25) unit_init_m8n1(%%zmm26,%%zmm27)
|
||||
#define INIT_m8n12 INIT_m8n10 unit_init_m8n1(%%zmm28,%%zmm29) unit_init_m8n1(%%zmm30,%%zmm31)
|
||||
/* COMPUTE_m8(ndim): one 8-row block of the kernel for ndim columns.
 * Clears the accumulators, runs COMPUTE_INIT (defined outside this chunk), copies %2
 * into %3 and then iterates over k using %6 as the down-counter:
 *   - while %6 >= 18: a 6x unrolled body; each pass issues prefetcht1 on the C column
 *     at %3 (then steps %3 by %4) and on (%8), which is then advanced by 40 bytes;
 *   - %3 is reset to %2 and the remaining iterations run one at a time, each
 *     preceded by a prefetcht0 of one C column.
 * Finally SAVE_m8n<ndim> writes the tile back.
 * Labels are numeric local labels built from ndim, so every instantiation inside
 * one asm statement gets its own set. */
#define COMPUTE_m8(ndim) \
|
||||
INIT_m8n##ndim\
|
||||
COMPUTE_INIT "movq %2,%3;"\
|
||||
"cmpq $18,%6; jb "#ndim"88880f;"\
|
||||
#ndim"88889:\n\t"\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
"prefetcht1 (%3); prefetcht1 63(%3); addq %4,%3;"\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
"prefetcht1 (%8); addq $40,%8;"\
|
||||
"subq $6,%6; cmpq $18,%6; jnb "#ndim"88889b;"\
|
||||
"movq %2,%3;"\
|
||||
#ndim"88880:\n\t"\
|
||||
"testq %6,%6; jz "#ndim"88881f;"\
|
||||
"prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
|
||||
KERNEL_t_k1m8n##ndim\
|
||||
"decq %6; jmp "#ndim"88880b;"\
|
||||
#ndim"88881:\n\t"\
|
||||
SAVE_m8n##ndim
|
||||
|
||||
/* m=4, ymm0-ymm3 for temporary use, ymm4-ymm15 for accumulators */
|
||||
/* unit_kernel_k1m4n1(a0, ap, b_off_r, b_off_i, c1, addr...): one k-step for one
 * column of the m=4 tile. The float at b_off_i(addr) is broadcast into ymm2 and
 * multiplied with ap into c1, then the float at b_off_r(addr) is broadcast and
 * multiplied with a0 into c1. ymm2 is scratch.
 * CGEMM_SKX_MODE only selects between vfmaddsub231ps and vfmsubadd231ps. */
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilps($177,a0)
|
||||
#define unit_kernel_k1m4n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231ps "#ap",%%ymm2,"#c1";"\
|
||||
"vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231ps "#a0",%%ymm2,"#c1";"
|
||||
#else //conjg_a != conjg_b
|
||||
#define unit_kernel_k1m4n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231ps "#ap",%%ymm2,"#c1";"\
|
||||
"vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231ps "#a0",%%ymm2,"#c1";"
|
||||
#endif
|
||||
/* k-step macros of the m=4 tile for n = 1..12.
 * KERNEL_h_k1m4n1 loads 32 bytes of A from (%0) into ymm0, builds the pair-swapped
 * copy in ymm1 (vpermilps $177) and advances %0 by 32 bytes. Larger n chain further
 * unit_kernel calls; byte offsets 0/4 and 8/12 select two consecutive B values.
 * Columns 1-6 are read through %1 (plus %r12 scaled by 1 or 2), columns 7-12
 * through %r15. The _t_ variants append the pointer increment; there is no _h_
 * form for n=6, so KERNEL_h_k1m4n8 chains KERNEL_t_k1m4n6 (which already bumps %1).
 * NOTE(review): %r12 / %r15 are initialised outside this chunk; presumably %r12 is
 * the byte distance between packed B column pairs - confirm. */
#define KERNEL_h_k1m4n1 \
|
||||
"vmovups (%0),%%ymm0; vpermilps $177,%%ymm0,%%ymm1; addq $32,%0;"\
|
||||
unit_kernel_k1m4n1(%%ymm0,%%ymm1,0,4,%%ymm4,%1)
|
||||
#define KERNEL_t_k1m4n1 KERNEL_h_k1m4n1 "addq $8,%1;"
|
||||
#define KERNEL_h_k1m4n2 KERNEL_h_k1m4n1 unit_kernel_k1m4n1(%%ymm0,%%ymm1,8,12,%%ymm5,%1)
|
||||
#define KERNEL_t_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
|
||||
#define unit_kernel_k1m4n2(c1,c2,...) \
|
||||
unit_kernel_k1m4n1(%%ymm0,%%ymm1,0,4,c1,__VA_ARGS__)\
|
||||
unit_kernel_k1m4n1(%%ymm0,%%ymm1,8,12,c2,__VA_ARGS__)
|
||||
#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_kernel_k1m4n2(%%ymm6,%%ymm7,%1,%%r12,1)
|
||||
#define KERNEL_t_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
|
||||
#define KERNEL_t_k1m4n6 KERNEL_h_k1m4n4 unit_kernel_k1m4n2(%%ymm8,%%ymm9,%1,%%r12,2) "addq $16,%1;"
|
||||
#define KERNEL_h_k1m4n8 KERNEL_t_k1m4n6 unit_kernel_k1m4n2(%%ymm10,%%ymm11,%%r15)
|
||||
#define KERNEL_t_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_kernel_k1m4n2(%%ymm12,%%ymm13,%%r15,%%r12,1)
|
||||
#define KERNEL_t_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_kernel_k1m4n2(%%ymm14,%%ymm15,%%r15,%%r12,2)
|
||||
#define KERNEL_t_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
|
||||
/* unit_save_m4n1(alp_r, alp_i, c1, addr...): write back one column of the m=4 tile.
 * ymm3 receives the pair-swapped copy of c1 (vpermilps $177). Both variants combine
 * c1*alp_r, ymm3*alp_i and the 32 bytes currently at the C address using
 * alternating-sign FMAs, then store c1 to the same address. The variants differ in
 * the sign pattern (vfmaddsub vs vfmsubadd) and in which register absorbs C first. */
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
|
||||
#define unit_save_m4n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilps $177,"#c1",%%ymm3; vfmaddsub213ps ("#__VA_ARGS__"),"#alp_i",%%ymm3;"\
|
||||
"vfmaddsub213ps %%ymm3,"#alp_r","#c1";vmovups "#c1",("#__VA_ARGS__");"
|
||||
#else //do conjugation on a_block
|
||||
#define unit_save_m4n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilps $177,"#c1",%%ymm3; vfmsubadd213ps ("#__VA_ARGS__"),"#alp_r","#c1";"\
|
||||
"vfmsubadd231ps %%ymm3,"#alp_i","#c1";vmovups "#c1",("#__VA_ARGS__");"
|
||||
#endif
|
||||
/* Write-back of the m=4 tile for n = 1..12 columns.
 * SAVE_SETUP_m4 copies %2 into %3, advances %2 by 32 bytes and broadcasts the floats
 * at (%5) and 4(%5) into ymm0 / ymm1. In COMPUTE %5 is constval, whose first two
 * entries are alphar and alphai (see CNAME). Columns are stored in pairs: (%3) and
 * (%3,%4,1), with unit_save_m4n2 first stepping %3 forward by 2*%4. */
#define SAVE_SETUP_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%5),%%ymm0; vbroadcastss 4(%5),%%ymm1;"
|
||||
#define SAVE_m4n1 SAVE_SETUP_m4 unit_save_m4n1(%%ymm0,%%ymm1,%%ymm4,%3)
|
||||
#define SAVE_m4n2 SAVE_m4n1 unit_save_m4n1(%%ymm0,%%ymm1,%%ymm5,%3,%4,1)
|
||||
#define unit_save_m4n2(c1,c2) \
|
||||
"leaq (%3,%4,2),%3;" unit_save_m4n1(%%ymm0,%%ymm1,c1,%3) unit_save_m4n1(%%ymm0,%%ymm1,c2,%3,%4,1)
|
||||
#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%ymm6,%%ymm7)
|
||||
#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(%%ymm8,%%ymm9)
|
||||
#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(%%ymm10,%%ymm11)
|
||||
#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(%%ymm12,%%ymm13)
|
||||
#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(%%ymm14,%%ymm15)
|
||||
/* Accumulator clearing for the m=4 tile: one ymm register per column, ymm4 for the
 * first column up to ymm15 for the twelfth. Each INIT_m4nN chains the smaller one. */
#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
|
||||
#define unit_init_m4n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";"
|
||||
#define INIT_m4n2 unit_init_m4n2(%%ymm4,%%ymm5)
|
||||
#define INIT_m4n4 INIT_m4n2 unit_init_m4n2(%%ymm6,%%ymm7)
|
||||
#define INIT_m4n6 INIT_m4n4 unit_init_m4n2(%%ymm8,%%ymm9)
|
||||
#define INIT_m4n8 INIT_m4n6 unit_init_m4n2(%%ymm10,%%ymm11)
|
||||
#define INIT_m4n10 INIT_m4n8 unit_init_m4n2(%%ymm12,%%ymm13)
|
||||
#define INIT_m4n12 INIT_m4n10 unit_init_m4n2(%%ymm14,%%ymm15)
|
||||
/* COMPUTE_m4(ndim): one 4-row block for ndim columns. Clears the accumulators, runs
 * COMPUTE_INIT (defined outside this chunk), then executes KERNEL_t_k1m4n<ndim> once
 * per remaining count in %6 (decremented to zero, no unrolling) and finally writes
 * the tile back with SAVE_m4n<ndim>. Labels are numeric local labels built from ndim. */
#define COMPUTE_m4(ndim) \
|
||||
INIT_m4n##ndim\
|
||||
COMPUTE_INIT\
|
||||
#ndim"88440:\n\t"\
|
||||
"testq %6,%6; jz "#ndim"88441f;"\
|
||||
KERNEL_t_k1m4n##ndim\
|
||||
"decq %6; jmp "#ndim"88440b;"\
|
||||
#ndim"88441:\n\t"\
|
||||
SAVE_m4n##ndim
|
||||
|
||||
/* m=2, xmm0-xmm3 for temporary use, xmm4-xmm15 for accumulators */
|
||||
/* unit_kernel_k1m2n1(a0, ap, b_off_r, b_off_i, c1, addr...): one k-step for one
 * column of the m=2 tile; same structure as the m=4 version but on xmm registers.
 * The float at b_off_i(addr) is broadcast into xmm2 and multiplied with ap into c1,
 * then the float at b_off_r(addr) is broadcast and multiplied with a0 into c1.
 * CGEMM_SKX_MODE only selects between vfmaddsub231ps and vfmsubadd231ps. */
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b;
|
||||
#define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\
|
||||
"vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";"
|
||||
#else //conjg_a != conjg_b
|
||||
#define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\
|
||||
"vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";"
|
||||
#endif
|
||||
/* k-step macros of the m=2 tile for n = 1..12.
 * KERNEL_h_k1m2n1 loads 16 bytes of A from (%0) into xmm0, builds the pair-swapped
 * copy in xmm1 and advances %0 by 16 bytes. Larger n chain further unit_kernel
 * calls: columns 1-6 are read through %1 (plus %r12 scaled by 1 or 2), columns 7-12
 * through %r15. The _t_ variants append the B pointer increment; KERNEL_h_k1m2n8
 * chains KERNEL_t_k1m2n6, which already advances %1. */
#define KERNEL_h_k1m2n1 \
|
||||
"vmovups (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $16,%0;"\
|
||||
unit_kernel_k1m2n1(%%xmm0,%%xmm1,0,4,%%xmm4,%1)
|
||||
#define KERNEL_t_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
|
||||
#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1 unit_kernel_k1m2n1(%%xmm0,%%xmm1,8,12,%%xmm5,%1)
|
||||
#define KERNEL_t_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
|
||||
#define unit_kernel_k1m2n2(c1,c2,...) \
|
||||
unit_kernel_k1m2n1(%%xmm0,%%xmm1,0,4,c1,__VA_ARGS__)\
|
||||
unit_kernel_k1m2n1(%%xmm0,%%xmm1,8,12,c2,__VA_ARGS__)
|
||||
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_kernel_k1m2n2(%%xmm6,%%xmm7,%1,%%r12,1)
|
||||
#define KERNEL_t_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
|
||||
#define KERNEL_t_k1m2n6 KERNEL_h_k1m2n4 unit_kernel_k1m2n2(%%xmm8,%%xmm9,%1,%%r12,2) "addq $16,%1;"
|
||||
#define KERNEL_h_k1m2n8 KERNEL_t_k1m2n6 unit_kernel_k1m2n2(%%xmm10,%%xmm11,%%r15)
|
||||
#define KERNEL_t_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_kernel_k1m2n2(%%xmm12,%%xmm13,%%r15,%%r12,1)
|
||||
#define KERNEL_t_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_kernel_k1m2n2(%%xmm14,%%xmm15,%%r15,%%r12,2)
|
||||
#define KERNEL_t_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
|
||||
/* unit_save_m2n1(alp_r, alp_i, c1, addr...): write back one column of the m=2 tile.
 * xmm3 receives the pair-swapped copy of c1. Both variants combine c1*alp_r,
 * xmm3*alp_i and the 16 bytes currently at the C address using alternating-sign
 * FMAs, then store c1 to the same address; CGEMM_SKX_MODE selects the sign pattern. */
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
|
||||
#define unit_save_m2n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilps $177,"#c1",%%xmm3; vfmaddsub213ps ("#__VA_ARGS__"),"#alp_i",%%xmm3;"\
|
||||
"vfmaddsub213ps %%xmm3,"#alp_r","#c1";vmovups "#c1",("#__VA_ARGS__");"
|
||||
#else //do conjugation on a_block
|
||||
#define unit_save_m2n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilps $177,"#c1",%%xmm3; vfmsubadd213ps ("#__VA_ARGS__"),"#alp_r","#c1";"\
|
||||
"vfmsubadd231ps %%xmm3,"#alp_i","#c1";vmovups "#c1",("#__VA_ARGS__");"
|
||||
#endif
|
||||
/* Write-back of the m=2 tile for n = 1..12 columns.
 * SAVE_SETUP_m2 copies %2 into %3, advances %2 by 16 bytes and broadcasts the floats
 * at (%5) and 4(%5) into xmm0 / xmm1 (alphar and alphai when called from COMPUTE).
 * Columns are stored in pairs at (%3) and (%3,%4,1); unit_save_m2n2 first steps %3
 * forward by 2*%4. */
#define SAVE_SETUP_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%5),%%xmm0; vbroadcastss 4(%5),%%xmm1;"
|
||||
#define SAVE_m2n1 SAVE_SETUP_m2 unit_save_m2n1(%%xmm0,%%xmm1,%%xmm4,%3)
|
||||
#define SAVE_m2n2 SAVE_m2n1 unit_save_m2n1(%%xmm0,%%xmm1,%%xmm5,%3,%4,1)
|
||||
#define unit_save_m2n2(c1,c2) \
|
||||
"leaq (%3,%4,2),%3;" unit_save_m2n1(%%xmm0,%%xmm1,c1,%3) unit_save_m2n1(%%xmm0,%%xmm1,c2,%3,%4,1)
|
||||
#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(%%xmm6,%%xmm7)
|
||||
#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(%%xmm8,%%xmm9)
|
||||
#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(%%xmm10,%%xmm11)
|
||||
#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(%%xmm12,%%xmm13)
|
||||
#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(%%xmm14,%%xmm15)
|
||||
/* Accumulator clearing for the m=2 tile: one xmm register per column, xmm4 for the
 * first column up to xmm15 for the twelfth. Each INIT_m2nN chains the smaller one.
 * INIT_m2n1 is also referenced by INIT_m1n2 further down. */
#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
|
||||
#define unit_init_m2n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";"
|
||||
#define INIT_m2n2 unit_init_m2n2(%%xmm4,%%xmm5)
|
||||
#define INIT_m2n4 INIT_m2n2 unit_init_m2n2(%%xmm6,%%xmm7)
|
||||
#define INIT_m2n6 INIT_m2n4 unit_init_m2n2(%%xmm8,%%xmm9)
|
||||
#define INIT_m2n8 INIT_m2n6 unit_init_m2n2(%%xmm10,%%xmm11)
|
||||
#define INIT_m2n10 INIT_m2n8 unit_init_m2n2(%%xmm12,%%xmm13)
|
||||
#define INIT_m2n12 INIT_m2n10 unit_init_m2n2(%%xmm14,%%xmm15)
|
||||
/* COMPUTE_m2(ndim): one 2-row block for ndim columns. Clears the accumulators, runs
 * COMPUTE_INIT (defined outside this chunk), executes KERNEL_t_k1m2n<ndim> once per
 * remaining count in %6 (decremented to zero) and writes the tile back with
 * SAVE_m2n<ndim>. Labels are numeric local labels built from ndim. */
#define COMPUTE_m2(ndim) \
|
||||
INIT_m2n##ndim\
|
||||
COMPUTE_INIT\
|
||||
#ndim"88220:\n\t"\
|
||||
"testq %6,%6; jz "#ndim"88221f;"\
|
||||
KERNEL_t_k1m2n##ndim\
|
||||
"decq %6; jmp "#ndim"88220b;"\
|
||||
#ndim"88221:\n\t"\
|
||||
SAVE_m2n##ndim
|
||||
|
||||
/* m=1, xmm0-xmm3 and xmm10-xmm15 for temporary use, xmm4-xmm9 for accumulators */
|
||||
/* Unit k-steps of the m=1 tile.
 * unit_kernel_k1m1n1: single column; broadcasts the floats at b_off_i(addr) and
 * b_off_r(addr) into xmm2 and multiplies them with ap / a0 into c1.
 * unit_kernel_k1m1n2: two columns in one xmm accumulator; vmovshdup / vmovsldup load
 * 16 bytes from addr and duplicate the odd / even floats, which are then multiplied
 * with ap / a0 into c1.
 * CGEMM_SKX_MODE only selects between vfmaddsub231ps and vfmsubadd231ps. */
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilps($177,a0)
|
||||
#define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\
|
||||
"vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";"
|
||||
#define unit_kernel_k1m1n2(a0,ap,c1,...) \
|
||||
"vmovshdup ("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\
|
||||
"vmovsldup ("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";"
|
||||
#else //conjg_a != conjg_b
|
||||
#define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\
|
||||
"vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";"
|
||||
#define unit_kernel_k1m1n2(a0,ap,c1,...) \
|
||||
"vmovshdup ("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\
|
||||
"vmovsldup ("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";"
|
||||
#endif
|
||||
/* k-step macros of the m=1 tile for n = 1..12.
 * n=1 loads the 8-byte A element with vmovsd; n>=2 loads it with vmovddup so that
 * both 8-byte halves of xmm0 hold the same element, and each xmm accumulator
 * (xmm4..xmm9) then covers two columns. %0 advances by 8 bytes per step.
 * Column pairs 1-3 are read through %1 (plus %r12 scaled by 1 or 2), pairs 4-6
 * through %r15. The _t_ variants append the B pointer increment; KERNEL_h_k1m1n8
 * chains KERNEL_t_k1m1n6, which already advances %1. */
#define KERNEL_h_k1m1n1 \
|
||||
"vmovsd (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $8,%0;"\
|
||||
unit_kernel_k1m1n1(%%xmm0,%%xmm1,0,4,%%xmm4,%1)
|
||||
#define KERNEL_t_k1m1n1 KERNEL_h_k1m1n1 "addq $8,%1;"
|
||||
#define KERNEL_h_k1m1n2 \
|
||||
"vmovddup (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $8,%0;"\
|
||||
unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm4,%1)
|
||||
#define KERNEL_t_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
|
||||
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm5,%1,%%r12,1)
|
||||
#define KERNEL_t_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
|
||||
#define KERNEL_t_k1m1n6 KERNEL_h_k1m1n4 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm6,%1,%%r12,2) "addq $16,%1;"
|
||||
#define KERNEL_h_k1m1n8 KERNEL_t_k1m1n6 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm7,%%r15)
|
||||
#define KERNEL_t_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm8,%%r15,%%r12,1)
|
||||
#define KERNEL_t_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm9,%%r15,%%r12,2)
|
||||
#define KERNEL_t_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
|
||||
/* Write-back helpers of the m=1 tile.
 * unit_save_m1n1: one column; loads the 8-byte C element with vmovsd into xmm2,
 * combines it with c1*alp_r and swapped(c1)*alp_i, and stores the low 8 bytes of c1.
 * unit_save_m1n2: two columns held in one accumulator; the low half of xmm2 is
 * loaded from (%3) and the high half from (%3,%4,1), the result is formed in xmm3,
 * its halves are stored to the same two addresses, and %3 is advanced by 2*%4.
 * CGEMM_SKX_MODE selects the sign pattern (vfmaddsub vs vfmsubadd). */
#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
|
||||
#define unit_save_m1n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilps $177,"#c1",%%xmm3; vmovsd ("#__VA_ARGS__"),%%xmm2; vfmaddsub213ps %%xmm2,"#alp_i",%%xmm3;"\
|
||||
"vfmaddsub213ps %%xmm3,"#alp_r","#c1";vmovsd "#c1",("#__VA_ARGS__");"
|
||||
#define unit_save_m1n2(alp_r,alp_i,c1) \
|
||||
"vpermilps $177,"#c1",%%xmm3; vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2;"\
|
||||
"vfmaddsub213ps %%xmm2,"#alp_i",%%xmm3; vfmaddsub231ps "#c1","#alp_r",%%xmm3;"\
|
||||
"vmovsd %%xmm3,(%3); vmovhpd %%xmm3,(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||
#else //do conjugation on a_block
|
||||
#define unit_save_m1n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilps $177,"#c1",%%xmm3; vmovsd ("#__VA_ARGS__"),%%xmm2; vfmsubadd213ps %%xmm2,"#alp_r","#c1";"\
|
||||
"vfmsubadd231ps %%xmm3,"#alp_i","#c1";vmovsd "#c1",("#__VA_ARGS__");"
|
||||
#define unit_save_m1n2(alp_r,alp_i,c1) \
|
||||
"vpermilps $177,"#c1",%%xmm3; vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2;"\
|
||||
"vfmsubadd213ps %%xmm2,"#alp_r","#c1"; vfmsubadd213ps "#c1","#alp_i",%%xmm3;"\
|
||||
"vmovsd %%xmm3,(%3); vmovhpd %%xmm3,(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||
#endif
|
||||
/* Write-back of the m=1 tile for n = 1..12 columns.
 * SAVE_SETUP_m1 copies %2 into %3, advances %2 by 8 bytes and broadcasts the floats
 * at (%5) and 4(%5) into xmm0 / xmm1 (alphar and alphai when called from COMPUTE).
 * For n >= 2 every unit_save_m1n2 call stores two columns and advances %3 itself,
 * so SAVE_m1nN simply chains one call per accumulator xmm4..xmm9. */
#define SAVE_SETUP_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%5),%%xmm0; vbroadcastss 4(%5),%%xmm1;"
|
||||
#define SAVE_m1n1 SAVE_SETUP_m1 unit_save_m1n1(%%xmm0,%%xmm1,%%xmm4,%3)
|
||||
#define SAVE_m1n2 SAVE_SETUP_m1 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm4)
|
||||
#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm5)
|
||||
#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm6)
|
||||
#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm7)
|
||||
#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm8)
|
||||
#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm9)
|
||||
/* m=1, n=1: clear the single accumulator xmm4. */
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
/* m=1, n=2: both columns are accumulated in xmm4 (see KERNEL_h_k1m1n2), so the same
 * single clear is sufficient. Defined through INIT_m1n1 so the m=1 section does not
 * depend on a macro of the m=2 section; the expansion is unchanged. */
#define INIT_m1n2 INIT_m1n1
|
||||
/* Accumulator clearing for the m=1 tile with n = 4..12: one additional xmm register
 * (xmm5..xmm9) is zeroed for every further pair of columns. */
#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;"
|
||||
#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;"
|
||||
#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;"
|
||||
#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;"
|
||||
#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;"
|
||||
/* COMPUTE_m1(ndim): the single leftover row for ndim columns. Clears the
 * accumulators, runs COMPUTE_INIT (defined outside this chunk), executes
 * KERNEL_t_k1m1n<ndim> once per remaining count in %6 (decremented to zero) and
 * writes the result back with SAVE_m1n<ndim>. Labels are built from ndim. */
#define COMPUTE_m1(ndim) \
|
||||
INIT_m1n##ndim\
|
||||
COMPUTE_INIT\
|
||||
#ndim"88110:\n\t"\
|
||||
"testq %6,%6; jz "#ndim"88111f;"\
|
||||
KERNEL_t_k1m1n##ndim\
|
||||
"decq %6; jmp "#ndim"88110b;"\
|
||||
#ndim"88111:\n\t"\
|
||||
SAVE_m1n##ndim
|
||||
|
||||
/* COMPUTE(ndim): process all M rows for one panel of ndim columns.
 * Sets b_pref to b_pointer + ndim*K*2 floats, then a single asm statement handles
 * the rows in blocks of 8 (looped while %7 >= 8) followed by at most one block each
 * of 4, 2 and 1 rows. Afterwards the C code rewinds a_pointer by M*K*2 floats,
 * advances b_pointer by ndim*K*2 floats and moves c_pointer by (LDC*ndim - M)*2.
 * Operand map: %0 a_pointer, %1 b_pointer, %2 c_pointer, %3 c_store,
 * %4 ldc_in_bytes, %5 constval, %6 K, %7 M, %8 b_pref.
 * GENERAL_INIT, CONSTZMM_INIT and GENERAL_RECOVER are defined outside this chunk.
 * NOTE(review): the pointer arithmetic after the asm presumably relies on
 * GENERAL_RECOVER restoring M, K and b_pointer to their entry values - confirm. */
#define COMPUTE(ndim) {\
|
||||
b_pref = b_pointer + ndim * K * 2;\
|
||||
__asm__ __volatile__(\
|
||||
GENERAL_INIT\
|
||||
CONSTZMM_INIT\
|
||||
"cmpq $8,%7;jb 33101"#ndim"f;"\
|
||||
"33109"#ndim":\n\t"\
|
||||
COMPUTE_m8(ndim)\
|
||||
"subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\
|
||||
"33101"#ndim":\n\t"\
|
||||
"cmpq $4,%7;jb 33102"#ndim"f;"\
|
||||
COMPUTE_m4(ndim)\
|
||||
"subq $4,%7;"\
|
||||
"33102"#ndim":\n\t"\
|
||||
"cmpq $2,%7;jb 33103"#ndim"f;"\
|
||||
COMPUTE_m2(ndim)\
|
||||
"subq $2,%7;"\
|
||||
"33103"#ndim":\n\t"\
|
||||
"testq %7,%7;jz 33104"#ndim"f;"\
|
||||
COMPUTE_m1(ndim)\
|
||||
"33104"#ndim":\n\t"\
|
||||
GENERAL_RECOVER\
|
||||
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(c_store),"+r"(ldc_in_bytes),"+r"(constval),"+r"(K),"+r"(M),"+r"(b_pref)\
|
||||
::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
|
||||
"zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
|
||||
"cc","memory");\
|
||||
a_pointer -= M * K * 2; b_pointer += ndim * K * 2; c_pointer += (LDC * ndim - M) * 2;\
|
||||
}
|
||||
|
||||
int __attribute__ ((noinline))
|
||||
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
|
||||
{
|
||||
if(m==0||n==0||k==0) return 0;
|
||||
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; float const_val[4] = {alphar, alphai, -1, 1};
|
||||
int64_t M = (int64_t)m, K = (int64_t)k;
|
||||
BLASLONG n_count = n;
|
||||
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*c_store = C,*constval = const_val,*b_pref = B;
|
||||
for(;n_count>11;n_count-=12) COMPUTE(12)
|
||||
for(;n_count>9;n_count-=10) COMPUTE(10)
|
||||
for(;n_count>7;n_count-=8) COMPUTE(8)
|
||||
for(;n_count>5;n_count-=6) COMPUTE(6)
|
||||
for(;n_count>3;n_count-=4) COMPUTE(4)
|
||||
for(;n_count>1;n_count-=2) COMPUTE(2)
|
||||
if(n_count>0) COMPUTE(1)
|
||||
return 0;
|
||||
}
|
||||
678
kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
Normal file
678
kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
Normal file
@@ -0,0 +1,678 @@
|
||||
#include "common.h"
|
||||
#include <stdint.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
|
||||
/* row-major c_block */
|
||||
/* One k-step for an 8-column tile (a single zmm of B) with m = 1, 2, 4 or 8 rows.
 * B: 64 bytes are loaded from (%1) into zmm5 and %1 is advanced by 64; a prefetcht0
 * is issued 384 bytes ahead. A: each row's element is broadcast from %0 into zmm4
 * (rows 5-8 are read at %0 + %r12) and multiplied with zmm5 into that row's
 * accumulator, zmm8 for the first row up to zmm15 for the eighth.
 * %0 itself is not advanced here; the INNER_KERNELm* loops do that. */
#define INNER_KERNEL_k1m1n8 \
|
||||
"prefetcht0 384(%1);"\
|
||||
"vmovupd (%1),%%zmm5; addq $64,%1;"\
|
||||
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;"
|
||||
|
||||
#define INNER_KERNEL_k1m2n8 \
|
||||
INNER_KERNEL_k1m1n8\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n8 \
|
||||
INNER_KERNEL_k1m2n8\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n8 \
|
||||
INNER_KERNEL_k1m4n8\
|
||||
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
|
||||
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
|
||||
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
|
||||
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
|
||||
|
||||
/* One k-step for a 16-column tile with m = 1, 2, 4 or 8 rows.
 * B: two 64-byte vectors are loaded, zmm5 from (%1) and zmm6 from (%1,%r12,2), then
 * %1 is advanced by 64; both streams are prefetched 128 bytes ahead.
 * A: each row's element is broadcast into zmm4 (rows 5-8 at %0 + %r12) and
 * multiplied with zmm5 / zmm6 into two accumulators per row: zmm8/zmm9 for the
 * first row up to zmm22/zmm23 for the eighth. */
#define INNER_KERNEL_k1m1n16 \
|
||||
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
|
||||
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
|
||||
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
|
||||
|
||||
#define INNER_KERNEL_k1m2n16 \
|
||||
INNER_KERNEL_k1m1n16\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n16 \
|
||||
INNER_KERNEL_k1m2n16\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n16 \
|
||||
INNER_KERNEL_k1m4n16\
|
||||
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
|
||||
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
|
||||
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
|
||||
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
|
||||
|
||||
/* One k-step for a 24-column tile with m = 1, 2, 4 or 8 rows.
 * B: three 64-byte vectors are loaded, zmm5 from (%1), zmm6 from (%1,%r12,2) and
 * zmm7 from (%1,%r12,4), then %1 is advanced by 64; all three streams are
 * prefetched 128 bytes ahead.
 * A: each row's element is broadcast into zmm4 (rows 5-8 at %0 + %r12) and
 * multiplied into three accumulators per row: zmm8-zmm10 for the first row up to
 * zmm29-zmm31 for the eighth. */
#define INNER_KERNEL_k1m1n24 \
|
||||
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
|
||||
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
|
||||
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
|
||||
|
||||
#define INNER_KERNEL_k1m2n24 \
|
||||
INNER_KERNEL_k1m1n24\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n24 \
|
||||
INNER_KERNEL_k1m2n24\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n24 \
|
||||
INNER_KERNEL_k1m4n24\
|
||||
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
|
||||
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
|
||||
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
|
||||
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
|
||||
|
||||
/* k loops for m = 1, 2 and 4 rows and nn columns. Each loop is skipped when %2 < 1;
 * otherwise it repeats INNER_KERNEL_k1m<m>n<nn>, advances %0 by 8*m bytes (8, 16 or
 * 32) and decrements %2 until it reaches zero. %2 is K in the COMPUTE_n* macros.
 * Labels are numeric local labels built from nn with a distinct suffix per m. */
#define INNER_KERNELm1(nn) \
|
||||
"cmpq $1,%2;jb "#nn"3f;"\
|
||||
#nn"4:\n\t"\
|
||||
INNER_KERNEL_k1m1n##nn "addq $8,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"4b;"\
|
||||
#nn"3:\n\t"
|
||||
|
||||
#define INNER_KERNELm2(nn) \
|
||||
"cmpq $1,%2;jb "#nn"0f;"\
|
||||
#nn"1:\n\t"\
|
||||
INNER_KERNEL_k1m2n##nn "addq $16,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"1b;"\
|
||||
#nn"0:\n\t"
|
||||
|
||||
#define INNER_KERNELm4(nn) \
|
||||
"cmpq $1,%2;jb "#nn"00f;"\
|
||||
#nn"01:\n\t"\
|
||||
INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
|
||||
#nn"00:\n\t"
|
||||
|
||||
/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
|
||||
/* k loop for m = 8 rows and nn columns, with software prefetch.
 * %10 starts as a copy of %3. While %2 >= 16 a 4x unrolled body runs; every pass
 * issues prefetcht1 on the C column at %10 (then steps %10 by %4) and on (%11),
 * which is then advanced by 16 bytes. After that %10 is reset to %3 and the
 * remaining iterations run one at a time, each preceded by a prefetcht0 of two C
 * columns (%10 is stepped by 2*%4). %0 advances by 32 bytes per k-step. */
#define INNER_KERNELm8(nn) \
|
||||
"movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
|
||||
#nn"008:\n\t"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
"prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
"prefetcht1 (%11); addq $16,%11;"\
|
||||
"subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
|
||||
"movq %3,%10;"\
|
||||
#nn"001:\n\t"\
|
||||
"cmpq $1,%2;jb "#nn"000f;"\
|
||||
"prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
"decq %2;jmp "#nn"001b;"\
|
||||
""#nn"000:\n\t"
|
||||
|
||||
/* Accumulator clearing for every (m, n) tile shape. A tile of m rows and n columns
 * uses m*n/8 zmm accumulators starting at zmm8, so shapes with the same product
 * share one definition (for example m1n16 == m2n8 and m4n16 == m8n8). */
#define INNER_INIT_m1n8 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8;"
|
||||
|
||||
#define INNER_INIT_m2n8 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;"
|
||||
|
||||
#define INNER_INIT_m4n8 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
|
||||
|
||||
#define INNER_INIT_m8n8 \
|
||||
INNER_INIT_m4n8\
|
||||
"vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;"
|
||||
|
||||
#define INNER_INIT_m1n16 INNER_INIT_m2n8
|
||||
|
||||
#define INNER_INIT_m2n16 INNER_INIT_m4n8
|
||||
|
||||
#define INNER_INIT_m4n16 INNER_INIT_m8n8
|
||||
|
||||
#define INNER_INIT_m8n16 \
|
||||
INNER_INIT_m8n8\
|
||||
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\
|
||||
"vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;"
|
||||
|
||||
#define INNER_INIT_m1n24 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;"
|
||||
|
||||
#define INNER_INIT_m2n24 \
|
||||
INNER_INIT_m1n24\
|
||||
"vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;"
|
||||
|
||||
#define INNER_INIT_m4n24 \
|
||||
INNER_INIT_m4n16\
|
||||
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"
|
||||
|
||||
#define INNER_INIT_m8n24 \
|
||||
INNER_INIT_m8n16\
|
||||
"vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\
|
||||
"vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;"
|
||||
|
||||
/* INNER_SETINDEX: build in zmm6 the qword index vector {0,1,2,...,7} * %4 used by
 * the gather/scatter in INNER_STORE_m1n8 (%4 is ldc_in_bytes in COMPUTE_n*).
 * %4 is inserted into xmm4 and broadcast to zmm4. k1 starts as all ones and is
 * shifted left by one bit before every step, so lane 0 stays zero, lane 1 receives
 * %4 once, lane 2 twice, and so on up to lane 7 with seven additions.
 * Overwrites zmm4, zmm6 and k1. */
#define INNER_SETINDEX \
|
||||
"vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\
|
||||
"kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"
|
||||
|
||||
/* INNER_STORE_m1n8(c1, disp): update one row of C across 8 columns.
 * Gathers the 8 doubles at disp + %10 + zmm6[i] into zmm7, computes
 * c1 = c1 * zmm3 + zmm7 (zmm3 holds the broadcast alpha) and scatters c1 back to the
 * same addresses. k1 is set to all ones before the gather and again before the
 * scatter because those instructions clear the mask register as they complete.
 * Requires zmm6 to hold the index vector built by INNER_SETINDEX. */
#define INNER_STORE_m1n8(c1,disp) \
|
||||
"kxnorw %%k1,%%k1,%%k1;"\
|
||||
"vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\
|
||||
"vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
|
||||
"kxnorw %%k1,%%k1,%%k1;"\
|
||||
"vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};"
|
||||
|
||||
/* Write-back for tiles of 1 or 2 rows, using gather/scatter because the 8 values of
 * an accumulator belong to 8 different columns of C.
 * Each macro copies %3 into %10, builds the column index vector and then calls
 * INNER_STORE_m1n8 once per accumulator. The second row is addressed with a
 * displacement of 8 bytes, and every further group of 8 columns by advancing %10 by
 * 8*%4. The accumulator order follows the layout of the matching INNER_KERNEL
 * macros (for m2n16, for instance, zmm8/zmm10 hold columns 0-7 of rows 0/1 and
 * zmm9/zmm11 hold columns 8-15). */
#define INNER_SAVE_m1n8 \
|
||||
"movq %3,%10;"\
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)
|
||||
|
||||
#define INNER_SAVE_m1n16 \
|
||||
INNER_SAVE_m1n8\
|
||||
"leaq (%10,%4,8),%10;"\
|
||||
INNER_STORE_m1n8(%%zmm9,0)
|
||||
|
||||
#define INNER_SAVE_m1n24 \
|
||||
INNER_SAVE_m1n16\
|
||||
"leaq (%10,%4,8),%10;"\
|
||||
INNER_STORE_m1n8(%%zmm10,0)
|
||||
|
||||
#define INNER_SAVE_m2n8 \
|
||||
"movq %3,%10;"\
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)\
|
||||
INNER_STORE_m1n8(%%zmm9,8)
|
||||
|
||||
#define INNER_SAVE_m2n16 \
|
||||
"movq %3,%10;"\
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)\
|
||||
INNER_STORE_m1n8(%%zmm10,8)\
|
||||
"leaq (%10,%4,8),%10;"\
|
||||
INNER_STORE_m1n8(%%zmm9,0)\
|
||||
INNER_STORE_m1n8(%%zmm11,8)
|
||||
|
||||
#define INNER_SAVE_m2n24 \
|
||||
"movq %3,%10;"\
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)\
|
||||
INNER_STORE_m1n8(%%zmm11,8)\
|
||||
"leaq (%10,%4,8),%10;"\
|
||||
INNER_STORE_m1n8(%%zmm9,0)\
|
||||
INNER_STORE_m1n8(%%zmm12,8)\
|
||||
"leaq (%10,%4,8),%10;"\
|
||||
INNER_STORE_m1n8(%%zmm10,0)\
|
||||
INNER_STORE_m1n8(%%zmm13,8)
|
||||
|
||||
/* In-register rearrangement of row-major accumulators before the column stores.
 * INNER_TRANS_4x8 reshuffles four accumulators with unpack, masked blend (mask
 * operand %6) and vshuff64x2 $0xb1, using zmm4-zmm7 as scratch; the results are
 * written back into c1..c4.
 * INNER_TRANS_8x8 applies it to two groups of four and then exchanges halves
 * between c_i and c_(i+4) with masked blends (mask operand %5) and vshuff64x2 $0x4e.
 * NOTE(review): presumably this is a 4x8 / 8x8 transposition turning "one row, 8
 * columns" registers into "one column, several rows" registers; the mask values
 * behind %5 and %6 (k02, k03) are set outside this chunk - confirm. */
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
|
||||
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
|
||||
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
|
||||
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
|
||||
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
|
||||
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
|
||||
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
|
||||
|
||||
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
|
||||
INNER_TRANS_4x8(c1,c2,c3,c4)\
|
||||
INNER_TRANS_4x8(c5,c6,c7,c8)\
|
||||
"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
|
||||
"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
|
||||
"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
|
||||
"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
|
||||
"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
|
||||
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
|
||||
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
|
||||
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
|
||||
|
||||
//%7 for k01(input) only when m=4
|
||||
/* Column stores after the in-register rearrangement.
 * INNER_STORE_4x8: each register is combined with two masked loads, one from (%10)
 * under mask %5 and one from -32(%10,%4,4) (four columns further, 32 bytes back)
 * under mask %7, updated as c = c*zmm3 + loaded value and written back with the
 * same two masked stores. %10 moves one column per register and 4 more columns at
 * the end, i.e. 8 columns per call.
 * INNER_STORE_8x8: each register is one full 64-byte column segment;
 * c = zmm3*c + memory, stored unmasked, %10 moving 2 columns per line (8 total).
 * NOTE(review): the mask values behind %5 / %7 (k02, k01) are set outside this
 * chunk; presumably they select the low and high four lanes - confirm. */
#define INNER_STORE_4x8(c1,c2,c3,c4) \
|
||||
"vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
|
||||
"vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
|
||||
"vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
|
||||
"vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
|
||||
"vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
|
||||
"vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
|
||||
"vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
|
||||
"vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
|
||||
"leaq (%10,%4,4),%10;"
|
||||
|
||||
#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
|
||||
"vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\
|
||||
"vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\
|
||||
"vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\
|
||||
"vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;"
|
||||
|
||||
/* Write-back for tiles of 4 or 8 rows. Every macro copies %3 into %10 and then, for
 * each group of 8 columns, rearranges the group's accumulators with INNER_TRANS_*
 * and stores them with INNER_STORE_*. The store macros advance %10 by 8 columns per
 * call, so successive groups land on successive column blocks. The register lists
 * pick, for each column group, the accumulator of every row as laid out by the
 * matching INNER_KERNEL_k1m*n* macro (stride 1, 2 or 3 registers per row for
 * n = 8, 16 or 24). */
#define INNER_SAVE_m4n8 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
|
||||
|
||||
#define INNER_SAVE_m4n16 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
|
||||
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
|
||||
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
|
||||
|
||||
#define INNER_SAVE_m4n24 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
|
||||
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
|
||||
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
|
||||
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
|
||||
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
|
||||
|
||||
#define INNER_SAVE_m8n8 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
|
||||
|
||||
#define INNER_SAVE_m8n16 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
|
||||
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
|
||||
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
|
||||
|
||||
#define INNER_SAVE_m8n24 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
|
||||
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
|
||||
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
|
||||
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
|
||||
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
|
||||
|
||||
/* COMPUTE_n8: process all M rows for one panel of 8 columns.
 * Operand map: %0 a_block_pointer, %1 packed_b_pointer, %2 K, %3 c_pointer,
 * %4 ldc_in_bytes, %5 k02, %6 k03, %7 k01, %8 M, %9 alpha, %10 c_store, %11 b_pref.
 * Setup: zmm3 = broadcast of *alpha, r14 = M, r13 = K, r12 = K << 5.
 * Rows are handled in blocks of 8 (looped while %8 >= 8), then at most one block
 * each of 4, 2 and 1. After every block K is restored from r13, %1 is rewound by
 * 2*r12 and %3 is advanced by 8 bytes per row handled; the 8-row block also
 * advances %0 by r12.
 * Epilogue: M is restored from r14, %3 is moved back by 8*M bytes and forward by
 * 8*%4. After the asm, a_block_pointer is rewound by M*K elements.
 * b_pref is set to packed_b_pointer + 8*K before the asm. */
#define COMPUTE_n8 {\
|
||||
b_pref = packed_b_pointer + 8 * K;\
|
||||
__asm__ __volatile__(\
|
||||
"vbroadcastsd (%9),%%zmm3;"\
|
||||
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
|
||||
"cmpq $8,%8; jb 42222f;"\
|
||||
"42221:\n\t"\
|
||||
INNER_INIT_m8n8\
|
||||
INNER_KERNELm8(8)\
|
||||
INNER_SAVE_m8n8\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
|
||||
"addq $64,%3;"\
|
||||
"subq $8,%8; cmpq $8,%8; jnb 42221b;"\
|
||||
"42222:\n\t"\
|
||||
"cmpq $4,%8; jb 42223f;"\
|
||||
INNER_INIT_m4n8\
|
||||
INNER_KERNELm4(8)\
|
||||
INNER_SAVE_m4n8\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $32,%3;"\
|
||||
"subq $4,%8;"\
|
||||
"42223:\n\t"\
|
||||
"cmpq $2,%8; jb 42224f;"\
|
||||
INNER_INIT_m2n8\
|
||||
INNER_KERNELm2(8)\
|
||||
INNER_SAVE_m2n8\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $16,%3;"\
|
||||
"subq $2,%8;"\
|
||||
"42224:\n\t"\
|
||||
"cmpq $1,%8; jb 42225f;"\
|
||||
INNER_INIT_m1n8\
|
||||
INNER_KERNELm1(8)\
|
||||
INNER_SAVE_m1n8\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $8,%3;"\
|
||||
"42225:\n\t"\
|
||||
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
||||
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
/* COMPUTE_n16: process every row of A against a 16-column strip of B/C in a single asm statement.
 * Asm operands: %0 a_block_pointer, %1 packed_b_pointer, %2 K, %3 c_pointer, %4 ldc_in_bytes,
 *               %5 k02, %6 k03, %7 k01, %8 M, %9 alpha, %10 c_store, %11 b_pref.
 *               (%10/%11 are not referenced in the text below; presumably the INNER_* macros use them.)
 * Prologue : b_pref is set 16*K doubles past packed_b_pointer; alpha is broadcast into zmm3;
 *            r14 keeps M, r13 keeps K, r12 = K<<5 (K*32).
 * Tiles    : rows are consumed 8 at a time (loop 32221), then one optional tile each of 4, 2 and 1 rows.
 *            After every tile K is reloaded from r13 and %1 is moved back by 2*r12; %3 advances by the
 *            tile height in bytes (64/32/16/8). Only the 8-row tile additionally adds r12 to %0.
 * Epilogue : M is reloaded from r14, %3 is moved back by M*8 bytes and forward by 16*ldc_in_bytes,
 *            and %1 is advanced by 4*r12 (K*128 bytes, i.e. 16*K doubles).
 *            The C statement after the asm rewinds a_block_pointer by M*K elements.
 */
#define COMPUTE_n16 {\
|
||||
b_pref = packed_b_pointer + 16 * K;\
|
||||
__asm__ __volatile__(\
|
||||
"vbroadcastsd (%9),%%zmm3;"\
|
||||
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
|
||||
"cmpq $8,%8; jb 32222f;"\
|
||||
"32221:\n\t"\
|
||||
INNER_INIT_m8n16\
|
||||
INNER_KERNELm8(16)\
|
||||
INNER_SAVE_m8n16\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
|
||||
"addq $64,%3;"\
|
||||
"subq $8,%8; cmpq $8,%8; jnb 32221b;"\
|
||||
"32222:\n\t"\
|
||||
"cmpq $4,%8; jb 32223f;"\
|
||||
INNER_INIT_m4n16\
|
||||
INNER_KERNELm4(16)\
|
||||
INNER_SAVE_m4n16\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $32,%3;"\
|
||||
"subq $4,%8;"\
|
||||
"32223:\n\t"\
|
||||
"cmpq $2,%8; jb 32224f;"\
|
||||
INNER_INIT_m2n16\
|
||||
INNER_KERNELm2(16)\
|
||||
INNER_SAVE_m2n16\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $16,%3;"\
|
||||
"subq $2,%8;"\
|
||||
"32224:\n\t"\
|
||||
"cmpq $1,%8; jb 32225f;"\
|
||||
INNER_INIT_m1n16\
|
||||
INNER_KERNELm1(16)\
|
||||
INNER_SAVE_m1n16\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $8,%3;"\
|
||||
"32225:\n\t"\
|
||||
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
||||
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
|
||||
"leaq (%1,%%r12,4),%1;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
|
||||
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
/* COMPUTE_n24: process every row of A against a 24-column strip of B/C in a single asm statement.
 * Asm operands: %0 a_block_pointer, %1 packed_b_pointer, %2 K, %3 c_pointer, %4 ldc_in_bytes,
 *               %5 k02, %6 k03, %7 k01, %8 M, %9 alpha, %10 c_store, %11 b_pref.
 *               (%10/%11 are not referenced in the text below; presumably the INNER_* macros use them.)
 * Prologue : b_pref is set 24*K doubles past packed_b_pointer; alpha is broadcast into zmm3;
 *            r14 keeps M, r13 keeps K, r12 = K<<5 (K*32).
 * Tiles    : rows are consumed 8 at a time (loop 22221), then one optional tile each of 4, 2 and 1 rows.
 *            After every tile K is reloaded from r13 and %1 is moved back by 2*r12; %3 advances by the
 *            tile height in bytes (64/32/16/8). Only the 8-row tile additionally adds r12 to %0.
 * Epilogue : M is reloaded from r14, %3 is moved back by M*8 bytes and forward by 24*ldc_in_bytes
 *            (8*ldc, then 16*ldc, after which %4 is shifted back), and %1 is advanced by 6*r12
 *            (K*192 bytes, i.e. 24*K doubles).
 *            The C statement after the asm rewinds a_block_pointer by M*K elements.
 */
#define COMPUTE_n24 {\
|
||||
b_pref = packed_b_pointer + 24 * K;\
|
||||
__asm__ __volatile__(\
|
||||
"vbroadcastsd (%9),%%zmm3;"\
|
||||
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
|
||||
"cmpq $8,%8; jb 22222f;"\
|
||||
"22221:\n\t"\
|
||||
INNER_INIT_m8n24\
|
||||
INNER_KERNELm8(24)\
|
||||
INNER_SAVE_m8n24\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
|
||||
"addq $64,%3;"\
|
||||
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\
|
||||
"22222:\n\t"\
|
||||
"cmpq $4,%8; jb 22223f;"\
|
||||
INNER_INIT_m4n24\
|
||||
INNER_KERNELm4(24)\
|
||||
INNER_SAVE_m4n24\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $32,%3;"\
|
||||
"subq $4,%8;"\
|
||||
"22223:\n\t"\
|
||||
"cmpq $2,%8; jb 22224f;"\
|
||||
INNER_INIT_m2n24\
|
||||
INNER_KERNELm2(24)\
|
||||
INNER_SAVE_m2n24\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $16,%3;"\
|
||||
"subq $2,%8;"\
|
||||
"22224:\n\t"\
|
||||
"cmpq $1,%8; jb 22225f;"\
|
||||
INNER_INIT_m1n24\
|
||||
INNER_KERNELm1(24)\
|
||||
INNER_SAVE_m1n24\
|
||||
"movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
|
||||
"addq $8,%3;"\
|
||||
"22225:\n\t"\
|
||||
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
||||
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
|
||||
"leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
|
||||
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
/* Main kernel: perform C += A<pack> B<pack> over ndiv8 strips of 8 columns (original note: icopy=4,ocopy=8).
 *   packed_a, packed_b : packed input panels
 *   m, k               : row count and inner dimension
 *   ndiv8              : number of 8-column strips to process
 *   LDC, c             : leading dimension (in elements) and base of C
 *   alpha              : pointer to the scalar; the COMPUTE_n* asm broadcasts it into zmm3
 * Strips are dispatched greedily: 24 columns at a time, then 16, then a final 8.
 * The local names below are fixed by the COMPUTE_n* macros, which reference them as asm operands.
 */
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){
  /* empty problem: nothing to accumulate */
  if(ndiv8==0 || m==0 || k==0) return;

  int64_t M = (int64_t)m;
  int64_t K = (int64_t)k;
  int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);

  /* opmask constants handed to the asm as operands %5 (k02), %6 (k03) and %7 (k01) */
  __mmask16 k01 = 0x00f0;
  __mmask16 k02 = 0x000f;
  __mmask16 k03 = 0x0033;

  double *a_block_pointer = packed_a;
  double *packed_b_pointer = packed_b;
  double *c_pointer = c;
  double *c_store = c;
  double *b_pref;               /* assigned at the top of every COMPUTE_n* expansion */

  BLASLONG strips_left = ndiv8;

  while(strips_left >= 3){      /* three strips = 24 columns per pass */
    COMPUTE_n24
    strips_left -= 3;
  }
  while(strips_left >= 2){      /* two strips = 16 columns per pass */
    COMPUTE_n16
    strips_left -= 2;
  }
  if(strips_left >= 1){         /* last single strip */
    COMPUTE_n8
  }
}
|
||||
|
||||
/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */
|
||||
/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */
|
||||
/* double accumulator: sc1; temporary variables: sa1,sb1 */
|
||||
/* column-major c_block */
|
||||
/* One k-step for a tile of 4 rows by 4 columns: load 4 doubles from A, then
 * fuse-multiply-add them with four broadcast B values into yc1..yc4
 * (one accumulator per column). Advances A and B by 4 doubles each. */
#define KERNEL_m4n4k1 {\
  ya1 = _mm256_loadu_pd(a_block_pointer);\
  yb1 = _mm256_broadcast_sd(b_block_pointer + 0);\
  yc1 = _mm256_fmadd_pd(ya1, yb1, yc1);\
  yb2 = _mm256_broadcast_sd(b_block_pointer + 1);\
  yc2 = _mm256_fmadd_pd(ya1, yb2, yc2);\
  yb1 = _mm256_broadcast_sd(b_block_pointer + 2);\
  yc3 = _mm256_fmadd_pd(ya1, yb1, yc3);\
  yb2 = _mm256_broadcast_sd(b_block_pointer + 3);\
  yc4 = _mm256_fmadd_pd(ya1, yb2, yc4);\
  a_block_pointer += 4;\
  b_block_pointer += 4;\
}

/* Same as above for 2 columns (accumulators yc1, yc2); B advances by 2. */
#define KERNEL_m4n2k1 {\
  ya1 = _mm256_loadu_pd(a_block_pointer);\
  yb1 = _mm256_broadcast_sd(b_block_pointer + 0);\
  yc1 = _mm256_fmadd_pd(ya1, yb1, yc1);\
  yb2 = _mm256_broadcast_sd(b_block_pointer + 1);\
  yc2 = _mm256_fmadd_pd(ya1, yb2, yc2);\
  a_block_pointer += 4;\
  b_block_pointer += 2;\
}

/* Same as above for a single column (accumulator yc1); B advances by 1. */
#define KERNEL_m4n1k1 {\
  ya1 = _mm256_loadu_pd(a_block_pointer);\
  yb1 = _mm256_broadcast_sd(b_block_pointer);\
  yc1 = _mm256_fmadd_pd(ya1, yb1, yc1);\
  a_block_pointer += 4;\
  ++b_block_pointer;\
}

/* Zero the accumulators a 4-row tile needs: yc1 / yc1-yc2 / yc1-yc4. */
#define INIT_m4n1 yc1 = _mm256_setzero_pd();
#define INIT_m4n2 yc2 = yc1 = _mm256_setzero_pd();
#define INIT_m4n4 yc4 = yc3 = yc2 = yc1 = _mm256_setzero_pd();
|
||||
/* Write back a 4-row, 1-column tile: C[0..3] = yc1*alpha + C[0..3]; c_pointer moves down 4 rows. */
#define SAVE_m4n1 {\
  yb1 = _mm256_broadcast_sd(alpha);\
  ya1 = _mm256_loadu_pd(c_pointer);\
  yc1 = _mm256_fmadd_pd(yc1, yb1, ya1);\
  _mm256_storeu_pd(c_pointer, yc1);\
  c_pointer += 4;\
}

/* Write back a 4-row, 2-column tile (columns are LDC elements apart); c_pointer moves down 4 rows. */
#define SAVE_m4n2 {\
  ya1 = _mm256_broadcast_sd(alpha);\
  yb1 = _mm256_loadu_pd(c_pointer);\
  yb2 = _mm256_loadu_pd(c_pointer + LDC);\
  yc1 = _mm256_fmadd_pd(yc1, ya1, yb1);\
  yc2 = _mm256_fmadd_pd(yc2, ya1, yb2);\
  _mm256_storeu_pd(c_pointer, yc1);\
  _mm256_storeu_pd(c_pointer + LDC, yc2);\
  c_pointer += 4;\
}

/* Write back a 4-row, 4-column tile. Columns are handled in two pairs, each pair
 * loaded, updated and stored before the next pair is touched. Columns 2 and 3 are
 * addressed by offset here, so c_pointer simply ends 4 rows further down. */
#define SAVE_m4n4 {\
  ya1 = _mm256_broadcast_sd(alpha);\
  yb1 = _mm256_loadu_pd(c_pointer);\
  yb2 = _mm256_loadu_pd(c_pointer + LDC);\
  yc1 = _mm256_fmadd_pd(yc1, ya1, yb1);\
  yc2 = _mm256_fmadd_pd(yc2, ya1, yb2);\
  _mm256_storeu_pd(c_pointer, yc1);\
  _mm256_storeu_pd(c_pointer + LDC, yc2);\
  yb1 = _mm256_loadu_pd(c_pointer + LDC*2);\
  yb2 = _mm256_loadu_pd(c_pointer + LDC*2 + LDC);\
  yc3 = _mm256_fmadd_pd(yc3, ya1, yb1);\
  yc4 = _mm256_fmadd_pd(yc4, ya1, yb2);\
  _mm256_storeu_pd(c_pointer + LDC*2, yc3);\
  _mm256_storeu_pd(c_pointer + LDC*2 + LDC, yc4);\
  c_pointer += 4;\
}
|
||||
/* One k-step for a 2-row, 2-column tile: 2 doubles of A times two duplicated
 * B values, accumulated into xc1 and xc2. A and B each advance by 2. */
#define KERNEL_m2n2k1 {\
  xa1 = _mm_loadu_pd(a_block_pointer);\
  xb1 = _mm_loaddup_pd(b_block_pointer + 0);\
  xc1 = _mm_fmadd_pd(xa1, xb1, xc1);\
  xb2 = _mm_loaddup_pd(b_block_pointer + 1);\
  xc2 = _mm_fmadd_pd(xa1, xb2, xc2);\
  a_block_pointer += 2;\
  b_block_pointer += 2;\
}

/* One k-step for a 2-row, 1-column tile (accumulator xc1). A advances by 2, B by 1. */
#define KERNEL_m2n1k1 {\
  xa1 = _mm_loadu_pd(a_block_pointer);\
  xb1 = _mm_loaddup_pd(b_block_pointer);\
  xc1 = _mm_fmadd_pd(xa1, xb1, xc1);\
  a_block_pointer += 2;\
  ++b_block_pointer;\
}

/* Zero the 128-bit accumulators: xc1 only, or xc1 and xc2. */
#define INIT_m2n1 xc1 = _mm_setzero_pd();
#define INIT_m2n2 xc2 = xc1 = _mm_setzero_pd();
|
||||
/* Write back a 2-row, 1-column tile: C[0..1] = xc1*alpha + C[0..1]; c_pointer moves down 2 rows. */
#define SAVE_m2n1 {\
  xb1 = _mm_loaddup_pd(alpha);\
  xa1 = _mm_loadu_pd(c_pointer);\
  xc1 = _mm_fmadd_pd(xc1, xb1, xa1);\
  _mm_storeu_pd(c_pointer, xc1);\
  c_pointer += 2;\
}

/* Write back a 2-row, 2-column tile (second column at c_pointer+LDC); c_pointer moves down 2 rows. */
#define SAVE_m2n2 {\
  xa1 = _mm_loaddup_pd(alpha);\
  xb1 = _mm_loadu_pd(c_pointer);\
  xb2 = _mm_loadu_pd(c_pointer + LDC);\
  xc1 = _mm_fmadd_pd(xc1, xa1, xb1);\
  xc2 = _mm_fmadd_pd(xc2, xa1, xb2);\
  _mm_storeu_pd(c_pointer, xc1);\
  _mm_storeu_pd(c_pointer + LDC, xc2);\
  c_pointer += 2;\
}
|
||||
/* Scalar k-step: sc1 += A[0] * B[0], then step both pointers by one element. */
#define KERNEL_m1n1k1 {\
  sa1 = *a_block_pointer++;\
  sb1 = *b_block_pointer++;\
  sc1 += sa1 * sb1;\
}

/* Reset the scalar accumulator. */
#define INIT_m1n1 sc1 = 0.0;

/* Scalar write-back: add alpha-scaled sc1 to the current C element and step to the next row. */
#define SAVE_m1n1 {\
  *c_pointer += sc1 * (*alpha);\
  ++c_pointer;\
}
|
||||
/* row-major c_block */
|
||||
/* Row-major tiles: here a vector of B is loaded and single A values are broadcast,
 * so each accumulator holds one row of the tile. */

/* 2 rows x 4 columns: yc1 gets row 0, yc2 gets row 1. B advances by 4, A by 2. */
#define KERNEL_m2n4k1 {\
  yb1 = _mm256_loadu_pd(b_block_pointer);\
  ya1 = _mm256_broadcast_sd(a_block_pointer + 0);\
  yc1 = _mm256_fmadd_pd(ya1, yb1, yc1);\
  ya1 = _mm256_broadcast_sd(a_block_pointer + 1);\
  yc2 = _mm256_fmadd_pd(ya1, yb1, yc2);\
  b_block_pointer += 4;\
  a_block_pointer += 2;\
}

/* 1 row x 4 columns: accumulator yc1. B advances by 4, A by 1. */
#define KERNEL_m1n4k1 {\
  yb1 = _mm256_loadu_pd(b_block_pointer);\
  ya1 = _mm256_broadcast_sd(a_block_pointer);\
  yc1 = _mm256_fmadd_pd(ya1, yb1, yc1);\
  b_block_pointer += 4;\
  ++a_block_pointer;\
}

/* 1 row x 2 columns: accumulator xc1. B advances by 2, A by 1. */
#define KERNEL_m1n2k1 {\
  xb1 = _mm_loadu_pd(b_block_pointer);\
  xa1 = _mm_loaddup_pd(a_block_pointer);\
  xc1 = _mm_fmadd_pd(xa1, xb1, xc1);\
  b_block_pointer += 2;\
  ++a_block_pointer;\
}

/* The row-major tiles need the same accumulators as their transposed
 * column-major counterparts, so the initialisers are shared. */
#define INIT_m1n2 INIT_m2n1
#define INIT_m1n4 INIT_m4n1
#define INIT_m2n4 INIT_m4n2
|
||||
/* Write back a 2-row, 4-column row-major tile. yc1/yc2 are scaled by alpha and
 * interleaved so that each 128-bit half holds the two rows of one column:
 * yb1 = {col0, col2}, yb2 = {col1, col3}. Columns 0/1 are read, updated and stored
 * before columns 2/3. c_pointer moves down 2 rows. */
#define SAVE_m2n4 {\
  ya1 = _mm256_broadcast_sd(alpha);\
  yc1 = _mm256_mul_pd(yc1, ya1);\
  yc2 = _mm256_mul_pd(yc2, ya1);\
  yb1 = _mm256_unpacklo_pd(yc1, yc2);\
  yb2 = _mm256_unpackhi_pd(yc1, yc2);\
  xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer), _mm256_extractf128_pd(yb1, 0));\
  xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer + LDC), _mm256_extractf128_pd(yb2, 0));\
  _mm_storeu_pd(c_pointer, xb1);\
  _mm_storeu_pd(c_pointer + LDC, xb2);\
  xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer + 2*LDC), _mm256_extractf128_pd(yb1, 1));\
  xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer + 3*LDC), _mm256_extractf128_pd(yb2, 1));\
  _mm_storeu_pd(c_pointer + 2*LDC, xb1);\
  _mm_storeu_pd(c_pointer + 3*LDC, xb2);\
  c_pointer += 2;\
}

/* Write back a 1-row, 2-column tile: the low lane of alpha*xc1 goes to column 0,
 * the high lane to column 1. c_pointer moves down 1 row. */
#define SAVE_m1n2 {\
  xb1 = _mm_loaddup_pd(alpha);\
  xc1 = _mm_mul_pd(xc1, xb1);\
  c_pointer[0] += _mm_cvtsd_f64(xc1);\
  xa1 = _mm_unpackhi_pd(xc1, xc1);\
  c_pointer[LDC] += _mm_cvtsd_f64(xa1);\
  ++c_pointer;\
}

/* Write back a 1-row, 4-column tile: the four lanes of alpha*yc1 are added to
 * columns 0..3 in that order. c_pointer moves down 1 row. */
#define SAVE_m1n4 {\
  ya1 = _mm256_broadcast_sd(alpha);\
  yc1 = _mm256_mul_pd(yc1, ya1);\
  xb1 = _mm256_extractf128_pd(yc1, 0);\
  c_pointer[0] += _mm_cvtsd_f64(xb1);\
  xb2 = _mm_unpackhi_pd(xb1, xb1);\
  c_pointer[LDC] += _mm_cvtsd_f64(xb2);\
  xb1 = _mm256_extractf128_pd(yc1, 1);\
  c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
  xb2 = _mm_unpackhi_pd(xb1, xb1);\
  c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
  ++c_pointer;\
}
|
||||
/* Edge kernel: perform C += A<pack> B<pack> for the trailing edge_n columns
 * (original note: edge_n<8 must be satisfied).
 * Columns are processed in strips of 4, then 2, then 1. Inside every strip the rows
 * are processed in tiles of 4, then 2, then 1, each tile running k steps of the
 * matching KERNEL_* macro between its INIT_* and SAVE_* macros.
 * Returns immediately when k, m or edge_n is zero or *alpha is 0.0.
 * Names such as a_block_pointer, b_block_pointer, c_pointer, yc*, xc* and sc1 are
 * fixed by the macros, which reference them directly.
 */
static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){
  if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return;

  double *a_block_pointer, *b_block_pointer;
  double *b_strip = packed_b;          /* start of the B data for the current column strip */
  double *c_pointer = c;
  __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
  __m128d xc1,xc2,xa1,xb1,xb2;
  double sc1,sa1,sb1;
  BLASLONG rows_left, kk;
  BLASLONG cols_left = edge_n;

  /* ---- strips of 4 columns ---- */
  while(cols_left >= 4){
    a_block_pointer = packed_a;
    rows_left = m;
    while(rows_left >= 4){
      b_block_pointer = b_strip;
      INIT_m4n4
      for(kk=k; kk>0; kk--) KERNEL_m4n4k1
      SAVE_m4n4
      rows_left -= 4;
    }
    while(rows_left >= 2){
      b_block_pointer = b_strip;
      INIT_m2n4
      for(kk=k; kk>0; kk--) KERNEL_m2n4k1
      SAVE_m2n4
      rows_left -= 2;
    }
    if(rows_left >= 1){
      b_block_pointer = b_strip;
      INIT_m1n4
      for(kk=k; kk>0; kk--) KERNEL_m1n4k1
      SAVE_m1n4
    }
    b_strip += 4*k;
    c_pointer += 4 * LDC - m;          /* SAVE_* moved c_pointer down m rows; go to the top of the next strip */
    cols_left -= 4;
  }

  /* ---- strips of 2 columns ---- */
  while(cols_left >= 2){
    a_block_pointer = packed_a;
    rows_left = m;
    while(rows_left >= 4){
      b_block_pointer = b_strip;
      INIT_m4n2
      for(kk=k; kk>0; kk--) KERNEL_m4n2k1
      SAVE_m4n2
      rows_left -= 4;
    }
    while(rows_left >= 2){
      b_block_pointer = b_strip;
      INIT_m2n2
      for(kk=k; kk>0; kk--) KERNEL_m2n2k1
      SAVE_m2n2
      rows_left -= 2;
    }
    if(rows_left >= 1){
      b_block_pointer = b_strip;
      INIT_m1n2
      for(kk=k; kk>0; kk--) KERNEL_m1n2k1
      SAVE_m1n2
    }
    b_strip += 2*k;
    c_pointer += 2 * LDC - m;
    cols_left -= 2;
  }

  /* ---- final single column ---- */
  if(cols_left >= 1){
    a_block_pointer = packed_a;
    rows_left = m;
    while(rows_left >= 4){
      b_block_pointer = b_strip;
      INIT_m4n1
      for(kk=k; kk>0; kk--) KERNEL_m4n1k1
      SAVE_m4n1
      rows_left -= 4;
    }
    while(rows_left >= 2){
      b_block_pointer = b_strip;
      INIT_m2n1
      for(kk=k; kk>0; kk--) KERNEL_m2n1k1
      SAVE_m2n1
      rows_left -= 2;
    }
    if(rows_left >= 1){
      b_block_pointer = b_strip;
      INIT_m1n1
      for(kk=k; kk>0; kk--) KERNEL_m1n1k1
      SAVE_m1n1
    }
  }
}
|
||||
/* Kernel entry point. Splits the n columns into a main part made of whole 8-column
 * strips (KERNEL_MAIN) and a remainder of n - 8*(n/8) columns (KERNEL_EDGE).
 * The edge call receives B advanced by k*8*(n/8) elements and C advanced by
 * ldc*8*(n/8) elements. alpha is copied to a local so both kernels can take its address.
 * Always returns 0; returns early when m, n or k is zero or alpha is 0.0.
 */
int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
  if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;

  double ALPHA = alpha;
  double *packed_a = A;
  BLASLONG ndiv8 = n/8;

  if(ndiv8>0){
    KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
  }
  if(n>ndiv8*8){
    double *b_edge = B+(int64_t)k*(int64_t)ndiv8*8;
    double *c_edge = C+(int64_t)ldc*(int64_t)ndiv8*8;
    KERNEL_EDGE(packed_a,b_edge,m,n-ndiv8*8,k,ldc,c_edge,&ALPHA);
  }
  return 0;
}
|
||||
782
kernel/x86_64/dgemm_kernel_8x8_skylakex.c
Normal file
782
kernel/x86_64/dgemm_kernel_8x8_skylakex.c
Normal file
@@ -0,0 +1,782 @@
|
||||
#include "common.h"
|
||||
#include <stdint.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#define ICOPY_4
|
||||
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
|
||||
/* row-major c_block */
|
||||
/* Single-k micro-kernels for an 8-column strip. %0 is the A pointer, %1 the B pointer.
 * m1: prefetch ahead on both streams, load 8 doubles of B into zmm5 and advance %1 by 64 bytes,
 *     broadcast A[0] into zmm4 and accumulate zmm5*zmm4 into zmm8.
 * m2/m4/m8: each builds on the smaller one and adds one broadcast+FMA per extra row,
 *     taking A from 8(%0), 16(%0), ... 56(%0) and accumulating into zmm9 ... zmm15.
 * %0 itself is not advanced here; the INNER_KERNELm* loop macros do that.
 */
#define INNER_KERNEL_k1m1n8 \
|
||||
"prefetcht0 384(%1);"\
|
||||
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\
|
||||
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;"
|
||||
|
||||
#define INNER_KERNEL_k1m2n8 \
|
||||
INNER_KERNEL_k1m1n8\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n8 \
|
||||
INNER_KERNEL_k1m2n8\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n8 \
|
||||
INNER_KERNEL_k1m4n8\
|
||||
"vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
|
||||
"vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
|
||||
"vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
|
||||
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
|
||||
|
||||
/* Single-k micro-kernels for a 16-column strip. Two B vectors are loaded per step:
 * zmm5 from (%1) and zmm6 from (%1,%r12,1), i.e. r12 bytes further on (r12 is set up by
 * the COMPUTE_n* macros); %1 then advances by 64 bytes.
 * Row i of the tile broadcasts A from 8*i(%0) into zmm4 and accumulates into the register
 * pair zmm(8+2i) / zmm(9+2i), so m8 ends at zmm22/zmm23.
 * %0 itself is not advanced here; the INNER_KERNELm* loop macros do that.
 */
#define INNER_KERNEL_k1m1n16 \
|
||||
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\
|
||||
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\
|
||||
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
|
||||
|
||||
#define INNER_KERNEL_k1m2n16 \
|
||||
INNER_KERNEL_k1m1n16\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n16 \
|
||||
INNER_KERNEL_k1m2n16\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n16 \
|
||||
INNER_KERNEL_k1m4n16\
|
||||
"vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
|
||||
"vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
|
||||
"vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
|
||||
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
|
||||
|
||||
/* Single-k micro-kernels for a 24-column strip. Three B vectors are loaded per step:
 * zmm5 from (%1), zmm6 from (%1,%r12,1) and zmm7 from (%1,%r12,2); %1 then advances by 64 bytes.
 * Row i of the tile broadcasts A from 8*i(%0) into zmm4 and accumulates into the register
 * triple zmm(8+3i) .. zmm(10+3i), so m8 uses every register up to zmm31.
 * %0 itself is not advanced here; the INNER_KERNELm* loop macros do that.
 */
#define INNER_KERNEL_k1m1n24 \
|
||||
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\
|
||||
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\
|
||||
"vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
|
||||
|
||||
#define INNER_KERNEL_k1m2n24 \
|
||||
INNER_KERNEL_k1m1n24\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n24 \
|
||||
INNER_KERNEL_k1m2n24\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n24 \
|
||||
INNER_KERNEL_k1m4n24\
|
||||
"vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
|
||||
"vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
|
||||
"vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
|
||||
"vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
|
||||
|
||||
/* K-loop drivers. %2 is the remaining-k counter and %0 the A pointer; nn is the strip
 * width (8, 16 or 24) and selects the INNER_KERNEL_k1m?n<nn> micro-kernel.
 * Local labels are formed by pasting nn in front of a suffix (e.g. nn=8 gives 84/83 for m1,
 * 81/80 for m2, 801/800 for m4 and 8008/8001/8000 for m8), which keeps them distinct per nn.
 * m1/m2/m4: skip if %2 < 1, otherwise run one micro-kernel per step and advance %0 by
 *           8/16/32 bytes until %2 reaches 0.
 * m8      : while at least 8 steps remain, run an 8x unrolled body (%0 += 64 after each
 *           micro-kernel, %2 -= 8 per pass); the rest is finished one step at a time.
 * %2 is left at 0; the COMPUTE_n* macros reload it afterwards.
 */
#define INNER_KERNELm1(nn) \
|
||||
"cmpq $1,%2;jb "#nn"3f;"\
|
||||
#nn"4:\n\t"\
|
||||
INNER_KERNEL_k1m1n##nn "addq $8,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"4b;"\
|
||||
#nn"3:\n\t"
|
||||
|
||||
#define INNER_KERNELm2(nn) \
|
||||
"cmpq $1,%2;jb "#nn"0f;"\
|
||||
#nn"1:\n\t"\
|
||||
INNER_KERNEL_k1m2n##nn "addq $16,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"1b;"\
|
||||
#nn"0:\n\t"
|
||||
|
||||
#define INNER_KERNELm4(nn) \
|
||||
"cmpq $1,%2;jb "#nn"00f;"\
|
||||
#nn"01:\n\t"\
|
||||
INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
|
||||
#nn"00:\n\t"
|
||||
|
||||
#define INNER_KERNELm8(nn) \
|
||||
"cmpq $8,%2;jb "#nn"001f;"\
|
||||
#nn"008:\n\t"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
"subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
|
||||
#nn"001:\n\t"\
|
||||
"cmpq $1,%2;jb "#nn"000f;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"001b;"\
|
||||
""#nn"000:\n\t"
|
||||
|
||||
/* Accumulator initialisers: each zeroes, with vpxorq, exactly the registers its tile uses.
 *   n8 : m1 -> zmm8, m2 -> zmm8-9, m4 -> zmm8-11, m8 -> zmm8-15
 *   n16: m1/m2/m4 reuse the n8 initialiser of twice the height (same register count),
 *        m8 -> zmm8-23
 *   n24: m1 -> zmm8-10, m2 -> zmm8-13, m4 -> zmm8-19, m8 -> zmm8-31
 */
#define INNER_INIT_m1n8 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8;"
|
||||
|
||||
#define INNER_INIT_m2n8 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;"
|
||||
|
||||
#define INNER_INIT_m4n8 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
|
||||
|
||||
#define INNER_INIT_m8n8 \
|
||||
INNER_INIT_m4n8\
|
||||
"vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;"
|
||||
|
||||
#define INNER_INIT_m1n16 INNER_INIT_m2n8
|
||||
|
||||
#define INNER_INIT_m2n16 INNER_INIT_m4n8
|
||||
|
||||
#define INNER_INIT_m4n16 INNER_INIT_m8n8
|
||||
|
||||
#define INNER_INIT_m8n16 \
|
||||
INNER_INIT_m8n8\
|
||||
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\
|
||||
"vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;"
|
||||
|
||||
#define INNER_INIT_m1n24 \
|
||||
"vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;"
|
||||
|
||||
#define INNER_INIT_m2n24 \
|
||||
INNER_INIT_m1n24\
|
||||
"vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;"
|
||||
|
||||
#define INNER_INIT_m4n24 \
|
||||
INNER_INIT_m4n16\
|
||||
"vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"
|
||||
|
||||
#define INNER_INIT_m8n24 \
|
||||
INNER_INIT_m8n16\
|
||||
"vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\
|
||||
"vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;"
|
||||
|
||||
/* INNER_SETINDEX: build the gather/scatter index vector in zmm6.
 *   %4 (ldc_in_bytes) is inserted into xmm4 and broadcast to every lane of zmm4.
 *   k1 starts as all ones and is shifted left once per step, so step j only touches lanes >= j.
 *   zmm6 is zeroed, lanes 1..7 receive %4, and six masked adds then leave lane j = j * %4.
 *   Clobbers zmm4, zmm6 and k1.
 *
 * INNER_STORE_m1n8(c1,disp): update 8 elements of C that are %4 bytes apart.
 *   Gathers from disp(%3 + zmm6) into zmm7, computes c1 = c1*zmm3 + zmm7 (zmm3 holds alpha)
 *   and scatters c1 back to the same addresses. k1 is set to all ones before the gather and
 *   again before the scatter (gather/scatter instructions consume their mask).
 */
#define INNER_SETINDEX \
|
||||
"vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\
|
||||
"kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
|
||||
"kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"
|
||||
|
||||
#define INNER_STORE_m1n8(c1,disp) \
|
||||
"kxnorw %%k1,%%k1,%%k1;"\
|
||||
"vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
|
||||
"vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
|
||||
"kxnorw %%k1,%%k1,%%k1;"\
|
||||
"vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
|
||||
|
||||
/* Write-back for 1- and 2-row tiles, built from INNER_SETINDEX + INNER_STORE_m1n8.
 * Each accumulator holds one row of 8 columns, so it is written with a gather/scatter
 * whose lanes are ldc_in_bytes apart; the second row of an m2 tile uses displacement 8.
 * For n16/n24 the C pointer %3 is advanced by 8*%4 (8 columns) between 8-column groups
 * and is NOT moved back here; the COMPUTE_n* macros undo that advance.
 * Register-to-tile mapping follows the micro-kernels:
 *   m1n16: zmm8, zmm9           m1n24: zmm8, zmm9, zmm10
 *   m2n8 : zmm8 / zmm9 (rows)   m2n16: zmm8,zmm10 then zmm9,zmm11
 *   m2n24: zmm8,zmm11 then zmm9,zmm12 then zmm10,zmm13
 */
#define INNER_SAVE_m1n8 \
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)
|
||||
|
||||
#define INNER_SAVE_m1n16 \
|
||||
INNER_SAVE_m1n8\
|
||||
"leaq (%3,%4,8),%3;"\
|
||||
INNER_STORE_m1n8(%%zmm9,0)
|
||||
|
||||
#define INNER_SAVE_m1n24 \
|
||||
INNER_SAVE_m1n16\
|
||||
"leaq (%3,%4,8),%3;"\
|
||||
INNER_STORE_m1n8(%%zmm10,0)
|
||||
|
||||
#define INNER_SAVE_m2n8 \
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)\
|
||||
INNER_STORE_m1n8(%%zmm9,8)
|
||||
|
||||
#define INNER_SAVE_m2n16 \
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)\
|
||||
INNER_STORE_m1n8(%%zmm10,8)\
|
||||
"leaq (%3,%4,8),%3;"\
|
||||
INNER_STORE_m1n8(%%zmm9,0)\
|
||||
INNER_STORE_m1n8(%%zmm11,8)
|
||||
|
||||
#define INNER_SAVE_m2n24 \
|
||||
INNER_SETINDEX\
|
||||
INNER_STORE_m1n8(%%zmm8,0)\
|
||||
INNER_STORE_m1n8(%%zmm11,8)\
|
||||
"leaq (%3,%4,8),%3;"\
|
||||
INNER_STORE_m1n8(%%zmm9,0)\
|
||||
INNER_STORE_m1n8(%%zmm12,8)\
|
||||
"leaq (%3,%4,8),%3;"\
|
||||
INNER_STORE_m1n8(%%zmm10,0)\
|
||||
INNER_STORE_m1n8(%%zmm13,8)
|
||||
|
||||
/* Prefetch the 8x8 block of C about to be updated: for each of the 8 columns starting at %3
 * (columns are %4 bytes apart) it touches offset 0 and offset 56 of the column.
 * Scaled addressing only offers factors 1/2/4/8, so %3 is temporarily advanced by 2*%4 and
 * then 1*%4 to reach columns 3, 5, 6 and 7; the three trailing subq restore %3.
 */
#define INNER_PREF_8x8 \
|
||||
"prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
|
||||
"prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
|
||||
"prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
|
||||
"prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
|
||||
"subq %4,%3; subq %4,%3; subq %4,%3;"
|
||||
|
||||
/* In-register re-layout of accumulator tiles before they are stored.
 * INNER_TRANS_4x8(c1..c4): permutes four zmm registers in place using vunpcklpd/vunpckhpd,
 *   blends under opmask %6 and a vshuff64x2 with immediate 0xb1; zmm4-zmm7 are scratch.
 * INNER_TRANS_8x8(c1..c8): applies INNER_TRANS_4x8 to c1-c4 and c5-c8, then for each pair
 *   (ci, ci+4) blends under opmask %5 and shuffles with immediate 0x4e to exchange data
 *   between the two registers; zmm4-zmm7 are again scratch.
 * NOTE(review): judging by the names and the "row-major c_block" remark in this file, the
 *   net effect is a transpose from one-row-per-register to one-column-per-register -- confirm.
 * NOTE(review): %5 and %6 are the k02 and k03 operands of the COMPUTE_n* asm; their values
 *   are set in KERNEL_MAIN, which is outside this view.
 */
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
|
||||
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
|
||||
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
|
||||
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
|
||||
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
|
||||
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
|
||||
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
|
||||
|
||||
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
|
||||
INNER_TRANS_4x8(c1,c2,c3,c4)\
|
||||
INNER_TRANS_4x8(c5,c6,c7,c8)\
|
||||
"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
|
||||
"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
|
||||
"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
|
||||
"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
|
||||
"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
|
||||
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
|
||||
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
|
||||
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
|
||||
|
||||
//%7 for k01(input) only when m=4
|
||||
/* INNER_STORE_4x8(c1..c4): update a 4-row by 8-column block of C from four registers.
 *   Each register is paired with two columns: lanes selected by %5 are loaded from (%3) and
 *   lanes selected by %7 from -32(%3,%4,4), i.e. the column 4*%4 bytes further on; the same
 *   two masks are used to store the result c = c*zmm3 + old (zmm3 holds alpha).
 *   %3 advances by %4 after each register and by a further 4*%4 at the end (8 columns total).
 *   zmm4-zmm7 are scratch.
 *   NOTE(review): this presumably relies on %5 selecting lanes 0-3 and %7 lanes 4-7 -- the
 *   mask values are set outside this view; confirm.
 * INNER_STORE_8x8(c1..c8): update an 8-row by 8-column block, one full register per column.
 *   Columns are handled in pairs: prefetcht1 at offset 120 of both, then
 *   c = c*zmm3 + mem followed by a full-width store, then %3 += 2*%4.
 *   After four pairs %3 has advanced by 8 columns.
 */
#define INNER_STORE_4x8(c1,c2,c3,c4) \
|
||||
"vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
|
||||
"vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
|
||||
"vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
|
||||
"vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
|
||||
"vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
|
||||
"vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
|
||||
"vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
|
||||
"vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
|
||||
"leaq (%3,%4,4),%3;"
|
||||
|
||||
#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
|
||||
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
|
||||
"vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
|
||||
"vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
|
||||
"vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
|
||||
"vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||
|
||||
/* Write-back for 4- and 8-row tiles: re-layout the accumulators with INNER_TRANS_*, then
 * update C with INNER_STORE_* (the m8 variants prefetch each 8x8 block first).
 * One TRANS/STORE pair handles 8 columns and leaves %3 advanced by 8 columns, so the n16
 * and n24 variants simply chain two or three pairs; the COMPUTE_n* macros move %3 back.
 * The register lists pick, for each group of 8 columns, the accumulator of every row:
 *   n8  rows are consecutive registers (zmm8, zmm9, ...),
 *   n16 rows are 2 registers apart (zmm8, zmm10, ... then zmm9, zmm11, ...),
 *   n24 rows are 3 registers apart (zmm8, zmm11, ... / zmm9, zmm12, ... / zmm10, zmm13, ...).
 */
#define INNER_SAVE_m4n8 \
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
|
||||
|
||||
#define INNER_SAVE_m4n16 \
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
|
||||
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
|
||||
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
|
||||
|
||||
#define INNER_SAVE_m4n24 \
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
|
||||
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
|
||||
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
|
||||
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
|
||||
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
|
||||
|
||||
#define INNER_SAVE_m8n8 \
|
||||
INNER_PREF_8x8\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
|
||||
|
||||
#define INNER_SAVE_m8n16 \
|
||||
INNER_PREF_8x8\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
|
||||
INNER_PREF_8x8\
|
||||
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
|
||||
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
|
||||
|
||||
#define INNER_SAVE_m8n24 \
|
||||
INNER_PREF_8x8\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
|
||||
INNER_PREF_8x8\
|
||||
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
|
||||
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
|
||||
INNER_PREF_8x8\
|
||||
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
|
||||
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
|
||||
|
||||
/* COMPUTE_n8: process every row of A against one 8-column strip of B/C in a single asm statement.
 * Asm operands: %0 a_block_pointer, %1 packed_b_pointer, %2 K, %3 c_pointer, %4 ldc_in_bytes,
 *               %5 k02, %6 k03, %7 k01, %8 M, %9 alpha.
 * Prologue : alpha is broadcast into zmm3; r14 keeps M, r13 keeps K, r12 = K<<6 (K*64 bytes,
 *            the size of one 8-column B panel).
 * Tiles    : rows are consumed 8 at a time (loop 42221), then one optional tile each of 4, 2 and 1.
 *            After every tile K is reloaded from r13 and %1 is moved back by r12 (the micro-kernels
 *            advanced it by 64 bytes per k step). For the 8- and 4-row tiles %3 is first moved back
 *            by 8*%4 because the store macros left it 8 columns ahead; then %3 advances by the
 *            tile height in bytes (64/32/16/8).
 * Epilogue : M is reloaded from r14, %3 is moved back by M*8 bytes and forward by 8*%4
 *            (start of the next 8 columns). %1 is not advanced in this variant.
 *            The C statement after the asm rewinds a_block_pointer by M*K elements.
 */
#define COMPUTE_n8 {\
|
||||
__asm__ __volatile__(\
|
||||
"vbroadcastsd (%9),%%zmm3;"\
|
||||
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
|
||||
"cmpq $8,%8; jb 42222f;"\
|
||||
"42221:\n\t"\
|
||||
INNER_INIT_m8n8\
|
||||
INNER_KERNELm8(8)\
|
||||
INNER_SAVE_m8n8\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
|
||||
"subq $8,%8; cmpq $8,%8; jnb 42221b;"\
|
||||
"42222:\n\t"\
|
||||
"cmpq $4,%8; jb 42223f;"\
|
||||
INNER_INIT_m4n8\
|
||||
INNER_KERNELm4(8)\
|
||||
INNER_SAVE_m4n8\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
|
||||
"subq $4,%8;"\
|
||||
"42223:\n\t"\
|
||||
"cmpq $2,%8; jb 42224f;"\
|
||||
INNER_INIT_m2n8\
|
||||
INNER_KERNELm2(8)\
|
||||
INNER_SAVE_m2n8\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"addq $16,%3;"\
|
||||
"subq $2,%8;"\
|
||||
"42224:\n\t"\
|
||||
"cmpq $1,%8; jb 42225f;"\
|
||||
INNER_INIT_m1n8\
|
||||
INNER_KERNELm1(8)\
|
||||
INNER_SAVE_m1n8\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"addq $8,%3;"\
|
||||
"42225:\n\t"\
|
||||
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
||||
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
/* COMPUTE_n16: process every row of A against a 16-column strip of B/C in a single asm statement.
 * Asm operands: %0 a_block_pointer, %1 packed_b_pointer, %2 K, %3 c_pointer, %4 ldc_in_bytes,
 *               %5 k02, %6 k03, %7 k01, %8 M, %9 alpha.
 * Prologue : alpha is broadcast into zmm3; r14 keeps M, r13 keeps K, r12 = K<<6 (K*64 bytes);
 *            the micro-kernels use r12 as the distance between the two 8-column B panels.
 * Tiles    : rows are consumed 8 at a time (loop 32221), then one optional tile each of 4, 2 and 1.
 *            After every tile K is reloaded from r13 and %1 is moved back by r12. %3 is moved back
 *            by the column advance left by the save macro (16*%4 for the 8/4-row tiles, 8*%4 for
 *            the 2/1-row tiles) and then forward by the tile height in bytes (64/32/16/8).
 * Epilogue : M is reloaded from r14, %3 is moved back by M*8 bytes and forward by 16*%4,
 *            and %1 is advanced by 2*r12 to the B panel after this strip.
 *            The C statement after the asm rewinds a_block_pointer by M*K elements.
 */
#define COMPUTE_n16 {\
|
||||
__asm__ __volatile__(\
|
||||
"vbroadcastsd (%9),%%zmm3;"\
|
||||
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
|
||||
"cmpq $8,%8; jb 32222f;"\
|
||||
"32221:\n\t"\
|
||||
INNER_INIT_m8n16\
|
||||
INNER_KERNELm8(16)\
|
||||
INNER_SAVE_m8n16\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
|
||||
"subq $8,%8; cmpq $8,%8; jnb 32221b;"\
|
||||
"32222:\n\t"\
|
||||
"cmpq $4,%8; jb 32223f;"\
|
||||
INNER_INIT_m4n16\
|
||||
INNER_KERNELm4(16)\
|
||||
INNER_SAVE_m4n16\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
|
||||
"subq $4,%8;"\
|
||||
"32223:\n\t"\
|
||||
"cmpq $2,%8; jb 32224f;"\
|
||||
INNER_INIT_m2n16\
|
||||
INNER_KERNELm2(16)\
|
||||
INNER_SAVE_m2n16\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
|
||||
"subq $2,%8;"\
|
||||
"32224:\n\t"\
|
||||
"cmpq $1,%8; jb 32225f;"\
|
||||
INNER_INIT_m1n16\
|
||||
INNER_KERNELm1(16)\
|
||||
INNER_SAVE_m1n16\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
|
||||
"32225:\n\t"\
|
||||
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
||||
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
|
||||
"leaq (%1,%%r12,2),%1;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
|
||||
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
/*
 * COMPUTE_n24: inline-asm driver that walks the M dimension for one 24-column slab.
 * Operand map (from the constraint list): %0=a_block_pointer, %1=packed_b_pointer, %2=K,
 * %3=c_pointer, %4=ldc_in_bytes, %5=k02, %6=k03, %7=k01, %8=M, %9=alpha.
 * Scratch: zmm3 <- broadcast of *alpha; r14 = saved M; r13 = saved K; r12 = K<<6.
 * Row tiles: 8 rows in a loop while M>=8, then at most one tile each of 4, 2 and 1 rows.
 * After each tile K is reloaded from r13, r12 is subtracted from packed_b_pointer, and
 * c_pointer is moved back 24*ldc_in_bytes (m8, m4; done as 8*ldc then 16*ldc) or
 * 16*ldc_in_bytes (m2, m1) and forward tile_rows*8 bytes.
 * Epilogue: M is reloaded from r14, c_pointer goes back M*8 bytes and forward
 * 24*ldc_in_bytes, packed_b_pointer advances by 3*r12; the trailing C statement rewinds
 * a_block_pointer by M*K elements.
 * NOTE(review): the INNER_* macros are defined outside this view; presumably they advance
 * %0/%1/%3 and count %2 down, which is what these fix-ups undo -- confirm.
 */
#define COMPUTE_n24 {\
|
||||
__asm__ __volatile__(\
|
||||
"vbroadcastsd (%9),%%zmm3;"\
|
||||
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
|
||||
"cmpq $8,%8; jb 22222f;"\
|
||||
"22221:\n\t"\
|
||||
INNER_INIT_m8n24\
|
||||
INNER_KERNELm8(24)\
|
||||
INNER_SAVE_m8n24\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
|
||||
"subq $8,%8; cmpq $8,%8; jnb 22221b;"\
|
||||
"22222:\n\t"\
|
||||
"cmpq $4,%8; jb 22223f;"\
|
||||
INNER_INIT_m4n24\
|
||||
INNER_KERNELm4(24)\
|
||||
INNER_SAVE_m4n24\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
|
||||
"subq $4,%8;"\
|
||||
"22223:\n\t"\
|
||||
"cmpq $2,%8; jb 22224f;"\
|
||||
INNER_INIT_m2n24\
|
||||
INNER_KERNELm2(24)\
|
||||
INNER_SAVE_m2n24\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
|
||||
"subq $2,%8;"\
|
||||
"22224:\n\t"\
|
||||
"cmpq $1,%8; jb 22225f;"\
|
||||
INNER_INIT_m1n24\
|
||||
INNER_KERNELm1(24)\
|
||||
INNER_SAVE_m1n24\
|
||||
"movq %%r13,%2; subq %%r12,%1;"\
|
||||
"shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
|
||||
"22225:\n\t"\
|
||||
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
||||
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
|
||||
"leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
|
||||
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
|
||||
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
  /*
   * Main driver: perform C += A<pack> B<pack> over ndiv8 groups of 8 columns.
   *   packed_a, packed_b : packed operand panels (icopy=8, ocopy=8)
   *   m, k               : row count and inner dimension
   *   ndiv8              : number of 8-column groups to process
   *   LDC, c             : leading dimension (in elements) and base of C
   *   alpha              : pointer to the scalar; handed to the COMPUTE_* asm as an operand
   * Returns immediately when k, m or ndiv8 is zero.
   * The COMPUTE_* macros read and update the locals below by name (a_block_pointer,
   * packed_b_pointer, c_pointer, K, M, ldc_in_bytes, k01..k03, alpha), so those names
   * must not change.
   */
  if(k==0 || m==0 || ndiv8==0) return;
  int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
  int64_t K = (int64_t)k; int64_t M = (int64_t)m;
  double *a_block_pointer = packed_a;
  double *packed_b_pointer = packed_b;
  double *c_pointer = c;
  __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
  BLASLONG ndiv8_count;
  /* widest slabs first: 24 columns while at least three groups remain */
  for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
    COMPUTE_n24
  }
  /* then 16 columns while at least two groups remain */
  for(;ndiv8_count>1;ndiv8_count-=2){
    COMPUTE_n16
  }
  /* a single leftover group of 8 columns */
  if(ndiv8_count>0){
    COMPUTE_n8
  }
}
|
||||
|
||||
/* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */
|
||||
/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */
|
||||
/* __m128d accumulators: xc1-xc4; temporary variables: xa1,xb1-xb2 */
|
||||
/* double accumulator: sc1; temporary variables: sa1,sb1 */
|
||||
/* column-major c_block */
|
||||
/*
 * KERNEL_m8n4k1: one k-step of the 8x4 tile.
 * Loads 8 doubles from a_block_pointer into za1 (pointer += 64 bytes), broadcasts four
 * consecutive doubles from b_block_pointer (pointer += 32 bytes) and accumulates
 * zcN += za1 * b[N-1] for N = 1..4.
 * za1/zb1/zb2 are write-only scratch inside the asm, so they are early-clobber outputs
 * ("=&v"); a "+v" operand would make the asm formally read variables that the caller
 * declares without an initialiser.
 */
#define KERNEL_m8n4k1 {\
  __asm__ __volatile__(\
    "vmovupd (%0),%2; addq $64,%0;"\
    "vbroadcastsd   (%1),%3; vfmadd231pd %2,%3,%5; "\
    "vbroadcastsd  8(%1),%4; vfmadd231pd %2,%4,%6; "\
    "vbroadcastsd 16(%1),%3; vfmadd231pd %2,%3,%7; "\
    "vbroadcastsd 24(%1),%4; vfmadd231pd %2,%4,%8; "\
    "addq $32,%1;"\
    :"+r"(a_block_pointer),"+r"(b_block_pointer),"=&v"(za1),"=&v"(zb1),"=&v"(zb2),"+v"(zc1),"+v"(zc2),"+v"(zc3),"+v"(zc4)::"cc","memory");\
}
|
||||
/*
 * KERNEL_m8n2k1: one k-step of the 8x2 tile.
 * Loads 8 doubles from a_block_pointer into za1 (pointer += 64 bytes), broadcasts two
 * consecutive doubles from b_block_pointer (pointer += 16 bytes) and accumulates
 * zc1 += za1 * b[0], zc2 += za1 * b[1].
 * za1/zb1/zb2 are write-only scratch inside the asm, so they are early-clobber outputs
 * ("=&v"); a "+v" operand would make the asm formally read variables that the caller
 * declares without an initialiser.
 */
#define KERNEL_m8n2k1 {\
  __asm__ __volatile__(\
    "vmovupd (%0),%2; addq $64,%0;"\
    "vbroadcastsd   (%1),%3; vfmadd231pd %2,%3,%5; "\
    "vbroadcastsd  8(%1),%4; vfmadd231pd %2,%4,%6; "\
    "addq $16,%1;"\
    :"+r"(a_block_pointer),"+r"(b_block_pointer),"=&v"(za1),"=&v"(zb1),"=&v"(zb2),"+v"(zc1),"+v"(zc2)::"cc","memory");\
}
|
||||
/*
 * KERNEL_m8n1k1: one k-step of the 8x1 tile.
 * Loads 8 doubles from a_block_pointer into za1 (pointer += 64 bytes), broadcasts one
 * double from b_block_pointer (pointer += 8 bytes) and accumulates zc1 += za1 * b[0].
 * za1/zb1 are write-only scratch inside the asm, so they are early-clobber outputs
 * ("=&v"); a "+v" operand would make the asm formally read variables that the caller
 * declares without an initialiser.
 */
#define KERNEL_m8n1k1 {\
  __asm__ __volatile__(\
    "vmovupd (%0),%2; addq $64,%0;"\
    "vbroadcastsd (%1),%3; vfmadd231pd %2,%3,%4; "\
    "addq $8,%1;"\
    :"+r"(a_block_pointer),"+r"(b_block_pointer),"=&v"(za1),"=&v"(zb1),"+v"(zc1)::"cc","memory");\
}
|
||||
/* Clear the 512-bit accumulators of an 8-row tile that is 1, 2 or 4 columns wide.
 * Each macro expands to a single chained assignment statement, trailing ';' included. */
#define INIT_m8n1 zc1=_mm512_setzero_pd();
#define INIT_m8n2 zc2=zc1=_mm512_setzero_pd();
#define INIT_m8n4 zc4=zc3=zc2=zc1=_mm512_setzero_pd();
|
||||
/*
 * SAVE_m8n1: write back an 8x1 tile, C[0..7] = zc1 * (*alpha) + C[0..7], then move
 * c_pointer down 8 rows.  Overwrites za1 (alpha broadcast) and zb1 (old C values).
 * In the asm alpha is only read and za1 only written, so they are declared as a plain
 * input ("r") and a plain output ("=v") instead of read-write operands; the "memory"
 * clobber stays because the instruction loads *alpha.
 */
#define SAVE_m8n1 {\
  __asm__ __volatile__("vbroadcastsd (%1),%0;":"=v"(za1):"r"(alpha):"memory");\
  zb1 = _mm512_loadu_pd(c_pointer);\
  zc1 = _mm512_fmadd_pd(zc1,za1,zb1);\
  _mm512_storeu_pd(c_pointer,zc1);\
  c_pointer += 8;\
}
|
||||
/*
 * SAVE_m8n2: write back an 8x2 tile (columns at c_pointer and c_pointer+LDC),
 * C = zcN * (*alpha) + C, then move c_pointer down 8 rows.
 * Overwrites za1 (alpha broadcast) and zb1/zb2 (old C values).
 * In the asm alpha is only read and za1 only written, so they are declared as a plain
 * input ("r") and a plain output ("=v") instead of read-write operands; the "memory"
 * clobber stays because the instruction loads *alpha.
 */
#define SAVE_m8n2 {\
  __asm__ __volatile__("vbroadcastsd (%1),%0;":"=v"(za1):"r"(alpha):"memory");\
  zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
  zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
  _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
  c_pointer += 8;\
}
|
||||
/*
 * SAVE_m8n4: write back an 8x4 tile two columns at a time, C = zcN * (*alpha) + C.
 * c_pointer is stepped 2*LDC forward for columns 2-3 and finally left 8 rows below its
 * starting position in column 0.  Overwrites za1 (alpha broadcast) and zb1/zb2.
 * In the asm alpha is only read and za1 only written, so they are declared as a plain
 * input ("r") and a plain output ("=v") instead of read-write operands; the "memory"
 * clobber stays because the instruction loads *alpha.
 */
#define SAVE_m8n4 {\
  __asm__ __volatile__("vbroadcastsd (%1),%0;":"=v"(za1):"r"(alpha):"memory");\
  zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
  zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
  _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
  c_pointer += LDC*2;\
  zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
  zc3 = _mm512_fmadd_pd(zc3,za1,zb1); zc4 = _mm512_fmadd_pd(zc4,za1,zb2);\
  _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\
  c_pointer += 8-LDC*2;\
}
|
||||
/* KERNEL_m4n4k1: one k-step of the 4x4 tile.  ya1 <- 4 doubles of A; each of the next four
 * B values is broadcast (alternating yb1/yb2) and fused into yc1..yc4; both pointers += 4. */
#define KERNEL_m4n4k1 {\
  ya1 = _mm256_loadu_pd(&a_block_pointer[0]);\
  a_block_pointer += 4;\
  yb1 = _mm256_broadcast_sd(&b_block_pointer[0]);\
  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
  yb2 = _mm256_broadcast_sd(&b_block_pointer[1]);\
  yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
  yb1 = _mm256_broadcast_sd(&b_block_pointer[2]);\
  yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\
  yb2 = _mm256_broadcast_sd(&b_block_pointer[3]);\
  yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\
  b_block_pointer += 4;\
}
|
||||
/* KERNEL_m4n2k1: one k-step of the 4x2 tile.  ya1 <- 4 doubles of A (pointer += 4); the next
 * two B values are broadcast into yb1/yb2 and fused into yc1/yc2 (B pointer += 2). */
#define KERNEL_m4n2k1 {\
  ya1 = _mm256_loadu_pd(&a_block_pointer[0]);\
  a_block_pointer += 4;\
  yb1 = _mm256_broadcast_sd(&b_block_pointer[0]);\
  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
  yb2 = _mm256_broadcast_sd(&b_block_pointer[1]);\
  yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
  b_block_pointer += 2;\
}
|
||||
/* KERNEL_m4n1k1: one k-step of the 4x1 tile.  ya1 <- 4 doubles of A (pointer += 4); one B
 * value is broadcast into yb1 and fused into yc1 (B pointer += 1). */
#define KERNEL_m4n1k1 {\
  ya1 = _mm256_loadu_pd(&a_block_pointer[0]);\
  a_block_pointer += 4;\
  yb1 = _mm256_broadcast_sd(&b_block_pointer[0]);\
  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
  b_block_pointer += 1;\
}
|
||||
/* Clear the 256-bit accumulators of a 4-row tile that is 1, 2 or 4 columns wide.
 * Each macro expands to a single chained assignment statement, trailing ';' included. */
#define INIT_m4n1 yc1=_mm256_setzero_pd();
#define INIT_m4n2 yc2=yc1=_mm256_setzero_pd();
#define INIT_m4n4 yc4=yc3=yc2=yc1=_mm256_setzero_pd();
|
||||
/* SAVE_m4n1: write back a 4x1 tile, C[0..3] = yc1 * (*alpha) + C[0..3]; c_pointer += 4.
 * Uses yb1 for the alpha broadcast and ya1 for the old C values. */
#define SAVE_m4n1 {\
  yb1 = _mm256_broadcast_sd(&alpha[0]);\
  ya1 = _mm256_loadu_pd(&c_pointer[0]);\
  yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\
  _mm256_storeu_pd(&c_pointer[0],yc1);\
  c_pointer += 4;\
}
|
||||
/* SAVE_m4n2: write back a 4x2 tile (columns at c_pointer and c_pointer+LDC),
 * C = ycN * (*alpha) + C; c_pointer += 4.  Uses ya1 for alpha and yb1/yb2 for old C. */
#define SAVE_m4n2 {\
  ya1 = _mm256_broadcast_sd(&alpha[0]);\
  yb1 = _mm256_loadu_pd(&c_pointer[0]);\
  yb2 = _mm256_loadu_pd(&c_pointer[LDC]);\
  yc1 = _mm256_fmadd_pd(yc1,ya1,yb1);\
  yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
  _mm256_storeu_pd(&c_pointer[0],yc1);\
  _mm256_storeu_pd(&c_pointer[LDC],yc2);\
  c_pointer += 4;\
}
|
||||
/* SAVE_m4n4: write back a 4x4 tile two columns at a time, C = ycN * (*alpha) + C.
 * Columns 0-1 are updated first, then columns 2-3; c_pointer ends 4 rows further down
 * in column 0.  Uses ya1 for alpha and yb1/yb2 for old C. */
#define SAVE_m4n4 {\
  ya1 = _mm256_broadcast_sd(&alpha[0]);\
  yb1 = _mm256_loadu_pd(&c_pointer[0]);\
  yb2 = _mm256_loadu_pd(&c_pointer[LDC]);\
  yc1 = _mm256_fmadd_pd(yc1,ya1,yb1);\
  yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
  _mm256_storeu_pd(&c_pointer[0],yc1);\
  _mm256_storeu_pd(&c_pointer[LDC],yc2);\
  c_pointer += LDC*2;\
  yb1 = _mm256_loadu_pd(&c_pointer[0]);\
  yb2 = _mm256_loadu_pd(&c_pointer[LDC]);\
  yc3 = _mm256_fmadd_pd(yc3,ya1,yb1);\
  yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\
  _mm256_storeu_pd(&c_pointer[0],yc3);\
  _mm256_storeu_pd(&c_pointer[LDC],yc4);\
  c_pointer += 4-LDC*2;\
}
|
||||
/* KERNEL_m2n2k1: one k-step of the 2x2 tile.  xa1 <- 2 doubles of A (pointer += 2); the next
 * two B values are duplicated into xb1/xb2 and fused into xc1/xc2 (B pointer += 2). */
#define KERNEL_m2n2k1 {\
  xa1 = _mm_loadu_pd(&a_block_pointer[0]);\
  a_block_pointer += 2;\
  xb1 = _mm_loaddup_pd(&b_block_pointer[0]);\
  xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
  xb2 = _mm_loaddup_pd(&b_block_pointer[1]);\
  xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\
  b_block_pointer += 2;\
}
|
||||
/* KERNEL_m2n1k1: one k-step of the 2x1 tile.  xa1 <- 2 doubles of A (pointer += 2); one B
 * value is duplicated into xb1 and fused into xc1 (B pointer += 1). */
#define KERNEL_m2n1k1 {\
  xa1 = _mm_loadu_pd(&a_block_pointer[0]);\
  a_block_pointer += 2;\
  xb1 = _mm_loaddup_pd(&b_block_pointer[0]);\
  xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
  b_block_pointer += 1;\
}
|
||||
/* Clear the 128-bit accumulators of a 2-row tile that is 1 or 2 columns wide.
 * Each macro expands to a single assignment statement, trailing ';' included. */
#define INIT_m2n1 xc1=_mm_setzero_pd();
#define INIT_m2n2 xc2=xc1=_mm_setzero_pd();
|
||||
/* SAVE_m2n1: write back a 2x1 tile, C[0..1] = xc1 * (*alpha) + C[0..1]; c_pointer += 2.
 * Uses xb1 for the duplicated alpha and xa1 for the old C values. */
#define SAVE_m2n1 {\
  xb1 = _mm_loaddup_pd(&alpha[0]);\
  xa1 = _mm_loadu_pd(&c_pointer[0]);\
  xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\
  _mm_storeu_pd(&c_pointer[0],xc1);\
  c_pointer += 2;\
}
|
||||
/* SAVE_m2n2: write back a 2x2 tile (columns at c_pointer and c_pointer+LDC),
 * C = xcN * (*alpha) + C; c_pointer += 2.  Uses xa1 for alpha and xb1/xb2 for old C. */
#define SAVE_m2n2 {\
  xa1 = _mm_loaddup_pd(&alpha[0]);\
  xb1 = _mm_loadu_pd(&c_pointer[0]);\
  xb2 = _mm_loadu_pd(&c_pointer[LDC]);\
  xc1 = _mm_fmadd_pd(xc1,xa1,xb1);\
  xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\
  _mm_storeu_pd(&c_pointer[0],xc1);\
  _mm_storeu_pd(&c_pointer[LDC],xc2);\
  c_pointer += 2;\
}
|
||||
/* KERNEL_m1n1k1: scalar k-step.  Reads one element of A and one of B (post-incrementing
 * both pointers) and adds their product to sc1. */
#define KERNEL_m1n1k1 {\
  sa1 = *a_block_pointer++;\
  sb1 = *b_block_pointer++;\
  sc1 += sa1 * sb1;\
}
|
||||
/* Clear the scalar accumulator of the 1x1 tile (expands to a full statement). */
#define INIT_m1n1 sc1 = 0.0;
|
||||
/* SAVE_m1n1: C[0] += sc1 * (*alpha), then step c_pointer to the next row. */
#define SAVE_m1n1 {\
  c_pointer[0] += sc1 * alpha[0];\
  c_pointer += 1;\
}
|
||||
|
||||
/* row-major c_block */
|
||||
/* KERNEL_m2n4k1: one k-step of the 2x4 tile held row-major.  yb1 <- 4 doubles of B
 * (pointer += 4); the two A values are broadcast in turn into ya1 and fused into
 * yc1 (row 0) and yc2 (row 1); A pointer += 2. */
#define KERNEL_m2n4k1 {\
  yb1 = _mm256_loadu_pd(&b_block_pointer[0]);\
  b_block_pointer += 4;\
  ya1 = _mm256_broadcast_sd(&a_block_pointer[0]);\
  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
  ya1 = _mm256_broadcast_sd(&a_block_pointer[1]);\
  yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\
  a_block_pointer += 2;\
}
|
||||
/* KERNEL_m1n4k1: one k-step of the 1x4 tile held row-major.  yb1 <- 4 doubles of B
 * (pointer += 4); one A value is broadcast into ya1 and fused into yc1; A pointer += 1. */
#define KERNEL_m1n4k1 {\
  yb1 = _mm256_loadu_pd(&b_block_pointer[0]);\
  b_block_pointer += 4;\
  ya1 = _mm256_broadcast_sd(&a_block_pointer[0]);\
  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
  a_block_pointer += 1;\
}
|
||||
/* KERNEL_m1n2k1: one k-step of the 1x2 tile held row-major.  xb1 <- 2 doubles of B
 * (pointer += 2); one A value is duplicated into xa1 and fused into xc1; A pointer += 1. */
#define KERNEL_m1n2k1 {\
  xb1 = _mm_loadu_pd(&b_block_pointer[0]);\
  b_block_pointer += 2;\
  xa1 = _mm_loaddup_pd(&a_block_pointer[0]);\
  xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
  a_block_pointer += 1;\
}
|
||||
/* Row-major tiles reuse the accumulators of the transposed shape: 1x2 clears xc1,
 * 1x4 clears yc1, 2x4 clears yc1 and yc2.  Each expands to one statement, ';' included. */
#define INIT_m1n2 xc1=_mm_setzero_pd();
#define INIT_m1n4 yc1=_mm256_setzero_pd();
#define INIT_m2n4 yc2=yc1=_mm256_setzero_pd();
|
||||
/* SAVE_m2n4: write back a 2x4 tile whose accumulators are row-major (yc1 = row 0,
 * yc2 = row 1).  Both rows are scaled by *alpha, interleaved with unpacklo/unpackhi so
 * each 128-bit half holds one column's two rows, and added to the four C columns at
 * c_pointer + j*LDC (j = 0..3).  c_pointer += 2 afterwards. */
#define SAVE_m2n4 {\
  ya1 = _mm256_broadcast_sd(&alpha[0]);\
  yc1 = _mm256_mul_pd(yc1,ya1);\
  yc2 = _mm256_mul_pd(yc2,ya1);\
  yb1 = _mm256_unpacklo_pd(yc1,yc2);\
  yb2 = _mm256_unpackhi_pd(yc1,yc2);\
  xb1 = _mm_add_pd(_mm_loadu_pd(&c_pointer[0]),_mm256_castpd256_pd128(yb1));\
  xb2 = _mm_add_pd(_mm_loadu_pd(&c_pointer[LDC]),_mm256_castpd256_pd128(yb2));\
  _mm_storeu_pd(&c_pointer[0],xb1);\
  _mm_storeu_pd(&c_pointer[LDC],xb2);\
  xb1 = _mm_add_pd(_mm_loadu_pd(&c_pointer[2*LDC]),_mm256_extractf128_pd(yb1,1));\
  xb2 = _mm_add_pd(_mm_loadu_pd(&c_pointer[3*LDC]),_mm256_extractf128_pd(yb2,1));\
  _mm_storeu_pd(&c_pointer[2*LDC],xb1);\
  _mm_storeu_pd(&c_pointer[3*LDC],xb2);\
  c_pointer += 2;\
}
|
||||
/* SAVE_m1n2: write back a 1x2 row-major tile.  xc1 is scaled by *alpha; its low lane is
 * added to C[0] and its high lane (moved down via unpackhi into xa1) to C[LDC].
 * c_pointer += 1 afterwards. */
#define SAVE_m1n2 {\
  xb1 = _mm_loaddup_pd(&alpha[0]);\
  xc1 = _mm_mul_pd(xc1,xb1);\
  c_pointer[0] += _mm_cvtsd_f64(xc1);\
  xa1 = _mm_unpackhi_pd(xc1,xc1);\
  c_pointer[LDC] += _mm_cvtsd_f64(xa1);\
  c_pointer += 1;\
}
|
||||
/* SAVE_m1n4: write back a 1x4 row-major tile.  yc1 is scaled by *alpha and its four
 * lanes are added one by one to C[0], C[LDC], C[2*LDC], C[3*LDC], using xb1/xb2 as
 * scratch for the 128-bit halves.  c_pointer += 1 afterwards. */
#define SAVE_m1n4 {\
  ya1 = _mm256_broadcast_sd(&alpha[0]);\
  yc1 = _mm256_mul_pd(yc1,ya1);\
  xb1 = _mm256_castpd256_pd128(yc1);\
  c_pointer[0] += _mm_cvtsd_f64(xb1);\
  xb2 = _mm_unpackhi_pd(xb1,xb1);\
  c_pointer[LDC] += _mm_cvtsd_f64(xb2);\
  xb1 = _mm256_extractf128_pd(yc1,1);\
  c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
  xb2 = _mm_unpackhi_pd(xb1,xb1);\
  c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
  c_pointer += 1;\
}
|
||||
|
||||
static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
  /*
   * Edge driver: perform C += A<pack> B<pack> for the trailing edge_n columns
   * (edge_n<8 must be satisfied).  Columns are taken in groups of 4, then 2, then 1;
   * inside each group rows are taken in tiles of 8, 4, 2 and finally 1.
   * Every tile is INIT_* (clear accumulators), k repetitions of KERNEL_*k1, then SAVE_*
   * (which also moves c_pointer down by the tile height).
   * a_block_pointer restarts at packed_a for each column group and runs on across the
   * row tiles; b_block_pointer restarts at the group's B panel for each row tile.
   * The tile macros use a_block_pointer, b_block_pointer, c_pointer, LDC, alpha and the
   * vector/scalar temporaries below by name.
   */
  if(k==0 || m==0 || edge_n==0) return;

  double *a_block_pointer, *b_block_pointer;
  double *b_panel = packed_b;   /* start of the current column group inside packed B */
  double *c_pointer = c;
  __m512d zb1,zb2,za1,zc1,zc2,zc3,zc4;
  __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
  __m128d xc1,xc2,xa1,xb1,xb2;
  double sc1,sa1,sb1;
  BLASLONG rows_left, cols_left, kk;

  cols_left = edge_n;

  /* ---- groups of 4 columns ---- */
  while(cols_left >= 4){
    a_block_pointer = packed_a;
    rows_left = m;
    while(rows_left >= 8){
      b_block_pointer = b_panel;
      INIT_m8n4
      for(kk=k; kk>0; kk--) KERNEL_m8n4k1
      SAVE_m8n4
      rows_left -= 8;
    }
    while(rows_left >= 4){
      b_block_pointer = b_panel;
      INIT_m4n4
      for(kk=k; kk>0; kk--) KERNEL_m4n4k1
      SAVE_m4n4
      rows_left -= 4;
    }
    while(rows_left >= 2){
      b_block_pointer = b_panel;
      INIT_m2n4
      for(kk=k; kk>0; kk--) KERNEL_m2n4k1
      SAVE_m2n4
      rows_left -= 2;
    }
    if(rows_left >= 1){
      b_block_pointer = b_panel;
      INIT_m1n4
      for(kk=k; kk>0; kk--) KERNEL_m1n4k1
      SAVE_m1n4
    }
    b_panel += 4*k;
    c_pointer += 4 * LDC - m;   /* top of the next column group */
    cols_left -= 4;
  }

  /* ---- groups of 2 columns ---- */
  while(cols_left >= 2){
    a_block_pointer = packed_a;
    rows_left = m;
    while(rows_left >= 8){
      b_block_pointer = b_panel;
      INIT_m8n2
      for(kk=k; kk>0; kk--) KERNEL_m8n2k1
      SAVE_m8n2
      rows_left -= 8;
    }
    while(rows_left >= 4){
      b_block_pointer = b_panel;
      INIT_m4n2
      for(kk=k; kk>0; kk--) KERNEL_m4n2k1
      SAVE_m4n2
      rows_left -= 4;
    }
    while(rows_left >= 2){
      b_block_pointer = b_panel;
      INIT_m2n2
      for(kk=k; kk>0; kk--) KERNEL_m2n2k1
      SAVE_m2n2
      rows_left -= 2;
    }
    if(rows_left >= 1){
      b_block_pointer = b_panel;
      INIT_m1n2
      for(kk=k; kk>0; kk--) KERNEL_m1n2k1
      SAVE_m1n2
    }
    b_panel += 2*k;
    c_pointer += 2 * LDC - m;   /* top of the next column group */
    cols_left -= 2;
  }

  /* ---- last single column ---- */
  if(cols_left >= 1){
    a_block_pointer = packed_a;
    rows_left = m;
    while(rows_left >= 8){
      b_block_pointer = b_panel;
      INIT_m8n1
      for(kk=k; kk>0; kk--) KERNEL_m8n1k1
      SAVE_m8n1
      rows_left -= 8;
    }
    while(rows_left >= 4){
      b_block_pointer = b_panel;
      INIT_m4n1
      for(kk=k; kk>0; kk--) KERNEL_m4n1k1
      SAVE_m4n1
      rows_left -= 4;
    }
    while(rows_left >= 2){
      b_block_pointer = b_panel;
      INIT_m2n1
      for(kk=k; kk>0; kk--) KERNEL_m2n1k1
      SAVE_m2n1
      rows_left -= 2;
    }
    if(rows_left >= 1){
      b_block_pointer = b_panel;
      INIT_m1n1
      for(kk=k; kk>0; kk--) KERNEL_m1n1k1
      SAVE_m1n1
    }
  }
}
|
||||
#ifdef ICOPY_4
|
||||
static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
  /*
   * Repack src into dst.  While at least 8 rows remain, two consecutive 4*k-element
   * panels of src are interleaved: for each of the k steps, 4 doubles from the first
   * panel are followed by 4 doubles from the second, giving one 8-wide panel in dst.
   * For the remaining rows (fewer than 8), rows_left*k doubles are copied unchanged.
   */
  double *lower = src;            /* first 4-wide panel of the current pair  */
  double *upper = src + 4 * k;    /* second 4-wide panel of the current pair */
  double *out = dst;
  BLASLONG rows_left = m, step;
  __m256d quad;

  while(rows_left > 7){
    for(step = k; step > 0; step--){
      quad = _mm256_loadu_pd(lower); _mm256_storeu_pd(out, quad);
      quad = _mm256_loadu_pd(upper); _mm256_storeu_pd(out + 4, quad);
      lower += 4; upper += 4; out += 8;
    }
    /* each pointer has consumed its own panel; hop over the partner's panel */
    lower += 4 * k;
    upper += 4 * k;
    rows_left -= 8;
  }

  while(rows_left > 0){
    for(step = k; step > 0; step--) *out++ = *lower++;
    rows_left--;
  }
}
|
||||
#endif
|
||||
int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
  /*
   * Kernel entry point.  Returns 0 at once when m, n or k is zero or alpha == 0.0.
   * Otherwise the first (n/8)*8 columns go to KERNEL_MAIN and any remaining columns to
   * KERNEL_EDGE, with B and C advanced past the columns already handled.  alpha is
   * passed on by address through a local copy.
   * With ICOPY_4 defined, A is first repacked by copy_4_to_8 into a temporary buffer
   * that is freed before returning; otherwise A is used directly.
   * Always returns 0.
   */
  if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;

  double ALPHA = alpha;
  BLASLONG ndiv8 = n/8;
  BLASLONG n_main = ndiv8*8;    /* columns covered by KERNEL_MAIN */

#ifdef ICOPY_4
  /* NOTE(review): the malloc result is used without a NULL check, as in the original. */
  double *packed_a = (double *)malloc(m*k*sizeof(double));
  copy_4_to_8(A,packed_a,m,k);
#else //ICOPY_8
  double *packed_a = A;
#endif

  if(ndiv8>0){
    KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
  }
  if(n>n_main){
    KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-n_main,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
  }

#ifdef ICOPY_4
  free(packed_a);packed_a=NULL;
#endif
  return 0;
}
|
||||
@@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
int __attribute__ ((noinline))
|
||||
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc)
|
||||
{
|
||||
unsigned long M = m, N = n, K = k;
|
||||
unsigned long long M = m, N = n, K = k;
|
||||
if (M == 0)
|
||||
return 0;
|
||||
if (N == 0)
|
||||
@@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo
|
||||
|
||||
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
|
||||
{
|
||||
int mnk = M * N * K;
|
||||
unsigned long long mnk = M * N * K;
|
||||
/* large matrixes -> not performant */
|
||||
if (mnk >= 28 * 512 * 512)
|
||||
return 0;
|
||||
@@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
|
||||
STORE_SCALAR(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
879
kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
Normal file
879
kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
Normal file
@@ -0,0 +1,879 @@
|
||||
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
|
||||
/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
|
||||
|
||||
#include "common.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/* m = 16 */ /* zmm8-zmm31 for accumulators, zmm1-zmm7 for temporary use, zmm0 for alpha */
|
||||
#define KERNEL_k1m16n1 \
|
||||
"vmovups (%0),%%zmm4; addq $64,%0;"\
|
||||
"vbroadcastss (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8;"\
|
||||
"addq $4,%1;"
|
||||
#define KERNEL_h_k1m16n2 \
|
||||
"vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
|
||||
"vbroadcastsd (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8; vfmadd231ps %%zmm5,%%zmm6,%%zmm9;"
|
||||
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;"
|
||||
#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;"
|
||||
#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
|
||||
#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \
|
||||
"vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\
|
||||
"vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";"
|
||||
#define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
|
||||
#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;"
|
||||
#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2)
|
||||
#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%1;"
|
||||
#define KERNEL_h_k1m16n16 KERNEL_k1m16n12 unit_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15)
|
||||
#define KERNEL_k1m16n16 KERNEL_h_k1m16n16 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m16n20 KERNEL_h_k1m16n16 unit_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1)
|
||||
#define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;"
|
||||
#define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
|
||||
#define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;"
|
||||
#define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;"
|
||||
#define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;"
|
||||
#define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
|
||||
#define unit_init_m16n4(c1,c2,c3,c4) \
|
||||
"vpxorq "#c1","#c1","#c1";vpxorq "#c2","#c2","#c2";vpxorq "#c3","#c3","#c3";vpxorq "#c4","#c4","#c4";"
|
||||
#define INIT_m16n8 INIT_m16n4 unit_init_m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
|
||||
#define INIT_m16n12 INIT_m16n8 unit_init_m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
|
||||
#define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
|
||||
#define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
|
||||
#define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31)
|
||||
#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);"
|
||||
#define unit_save_m16n2(c1,c2) \
|
||||
"vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\
|
||||
"vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\
|
||||
"vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;"
|
||||
#define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9)
|
||||
#define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11)
|
||||
#define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15)
|
||||
#define SAVE_h_m16n12 SAVE_h_m16n8 unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19)
|
||||
#define SAVE_h_m16n16 SAVE_h_m16n12 unit_save_m16n2(%%zmm20,%%zmm21) unit_save_m16n2(%%zmm22,%%zmm23)
|
||||
#define SAVE_h_m16n20 SAVE_h_m16n16 unit_save_m16n2(%%zmm24,%%zmm25) unit_save_m16n2(%%zmm26,%%zmm27)
|
||||
#define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31)
|
||||
#define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;"
|
||||
/*
 * COMPUTE_m16(ndim): asm text for one 16-row tile that is ndim columns wide.
 *  - INIT_m16n<ndim> clears the accumulators.
 *  - Setup: %4 <- r13, %1 <- r14, r15 <- %1 + 3*r12, %5 <- %2.
 *  - Main loop (entered only while %4 >= 18): six KERNEL_k1m16n<ndim> steps per pass;
 *    after the third, prefetcht1 on (%5) and 63(%5) and %5 += %3; after the sixth,
 *    prefetcht1 on (%8) and %8 += 32; then %4 -= 6.  On exit %5 is reset to %2.
 *  - Tail loop: while %4 != 0, prefetcht0 on two rows at %5 and %5+%3, one kernel step,
 *    %5 += 2*%3, %4 -= 1.
 *  - Finish: prefetcht0 on (r14) and 64(r14), then SAVE_m16(ndim).
 * Labels are numeric local labels built from ndim plus 016161/016162/016163.
 * NOTE(review): operand %8 is used here but is not listed in the operand comment at the
 * top of this file (which covers %0-%5) -- confirm its meaning at the asm statement.
 */
#define COMPUTE_m16(ndim) \
|
||||
INIT_m16n##ndim\
|
||||
"movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\
|
||||
"cmpq $18,%4; jb "#ndim"016162f;"\
|
||||
#ndim"016161:\n\t"\
|
||||
KERNEL_k1m16n##ndim\
|
||||
KERNEL_k1m16n##ndim\
|
||||
KERNEL_k1m16n##ndim\
|
||||
"prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\
|
||||
KERNEL_k1m16n##ndim\
|
||||
KERNEL_k1m16n##ndim\
|
||||
KERNEL_k1m16n##ndim\
|
||||
"prefetcht1 (%8); addq $32,%8;"\
|
||||
"subq $6,%4; cmpq $18,%4; jnb "#ndim"016161b;"\
|
||||
"movq %2,%5;"\
|
||||
#ndim"016162:\n\t"\
|
||||
"testq %4,%4; jz "#ndim"016163f;"\
|
||||
"prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\
|
||||
KERNEL_k1m16n##ndim\
|
||||
"leaq (%5,%3,2),%5;"\
|
||||
"decq %4; jmp "#ndim"016162b;"\
|
||||
#ndim"016163:\n\t"\
|
||||
"prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
|
||||
SAVE_m16(ndim)
|
||||
|
||||
/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
|
||||
#define KERNEL_k1m8n1(b_addr) \
|
||||
"vmovups (%0),%%ymm1; addq $32,%0;"\
|
||||
"vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
|
||||
"addq $4,"#b_addr";"
|
||||
#define KERNEL_h_k1m8n2(b_addr) \
|
||||
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
|
||||
"vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
|
||||
#define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";"
|
||||
#define KERNEL_h_k1m8n4(b_addr) \
|
||||
KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
|
||||
#define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";"
|
||||
#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \
|
||||
"vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
|
||||
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
|
||||
#define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1)
|
||||
#define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";"
|
||||
#define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2)
|
||||
#define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";"
|
||||
#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
|
||||
#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
|
||||
#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
|
||||
#define unit_init_m8n4(c1,c2,c3,c4) \
|
||||
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
|
||||
#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
|
||||
#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
|
||||
#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);"
|
||||
#define unit_save_m8n2(c1,c2) \
|
||||
"vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\
|
||||
"vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5), %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\
|
||||
"vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\
|
||||
"leaq (%5,%3,2),%5;"
|
||||
#define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
|
||||
#define SAVE_L_m8n4 SAVE_L_m8n2 unit_save_m8n2(%%ymm6,%%ymm7)
|
||||
#define SAVE_L_m8n8 SAVE_L_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
|
||||
#define SAVE_L_m8n12 SAVE_L_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
|
||||
#define SAVE_R_m8n4 unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7)
|
||||
#define SAVE_R_m8n8 SAVE_R_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
|
||||
#define SAVE_R_m8n12 SAVE_R_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
|
||||
/*
 * COMPUTE_L_m8(ndim,sim): asm text for an 8-row tile over ndim columns, with B read
 * through %1.  Clears the accumulators, sets %4 <- r13 and %1 <- r14, then runs
 * KERNEL_k1m8n<ndim>(%1) once per count in %4 (loop exits when %4 reaches zero),
 * stores with SAVE_L_m8n<ndim> and advances %2 by 32 bytes.
 * sim is pasted into the numeric local labels (<ndim><sim>882 / <ndim><sim>883) so that
 * several expansions inside one asm statement get distinct labels.
 */
#define COMPUTE_L_m8(ndim,sim) \
|
||||
INIT_m8n##ndim\
|
||||
"movq %%r13,%4; movq %%r14,%1;"\
|
||||
#ndim""#sim"882:\n\t"\
|
||||
"testq %4,%4; jz "#ndim""#sim"883f;"\
|
||||
KERNEL_k1m8n##ndim(%1)\
|
||||
"decq %4; jmp "#ndim""#sim"882b;"\
|
||||
#ndim""#sim"883:\n\t"\
|
||||
SAVE_L_m8n##ndim "addq $32,%2;"
|
||||
/*
 * COMPUTE_R_m8(ndim,sim): asm text for a second pass over the same 8 rows, with B read
 * through r15.  First subtracts r12 from %0 twice, clears the accumulators, sets
 * %4 <- r13 and r15 <- r14 + 3*r12, then runs KERNEL_k1m8n<ndim>(%%r15) once per count
 * in %4 and stores with SAVE_R_m8n<ndim>.
 * Unlike the L variant, nothing here reloads %5 or advances %2; the SAVE_R_* macros
 * store through %5 as they find it.
 * sim is pasted into the numeric local labels (<ndim><sim>882 / <ndim><sim>883).
 */
#define COMPUTE_R_m8(ndim,sim) \
|
||||
"subq %%r12,%0; subq %%r12,%0;"\
|
||||
INIT_m8n##ndim\
|
||||
"movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
|
||||
#ndim""#sim"882:\n\t"\
|
||||
"testq %4,%4; jz "#ndim""#sim"883f;"\
|
||||
KERNEL_k1m8n##ndim(%%r15)\
|
||||
"decq %4; jmp "#ndim""#sim"882b;"\
|
||||
#ndim""#sim"883:\n\t"\
|
||||
SAVE_R_m8n##ndim
|
||||
#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833)
|
||||
#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833)
|
||||
#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833)
|
||||
#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833)
|
||||
#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833)
|
||||
#define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933)
|
||||
#define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933)
|
||||
#define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933)
|
||||
#define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim
|
||||
|
||||
/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
|
||||
#define KERNEL_k1m4n1(b_addr) \
|
||||
"vmovups (%0),%%xmm1; addq $16,%0;"\
|
||||
"vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
|
||||
"addq $4,"#b_addr";"
|
||||
#define KERNEL_h_k1m4n2(b_addr) \
|
||||
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
|
||||
"vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
|
||||
#define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";"
|
||||
#define KERNEL_h_k1m4n4(b_addr) \
|
||||
KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
|
||||
#define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";"
|
||||
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
|
||||
"vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
|
||||
"vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
|
||||
#define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1)
|
||||
#define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";"
|
||||
#define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2)
|
||||
#define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";"
|
||||
#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
|
||||
#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
|
||||
#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
|
||||
#define unit_init_m4n4(c1,c2,c3,c4) \
|
||||
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
|
||||
#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
|
||||
#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
|
||||
#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);"
|
||||
#define unit_save_m4n2(c1,c2) \
|
||||
"vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\
|
||||
"vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5), %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\
|
||||
"vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\
|
||||
"leaq (%5,%3,2),%5;"
|
||||
#define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
|
||||
#define SAVE_L_m4n4 SAVE_L_m4n2 unit_save_m4n2(%%xmm6,%%xmm7)
|
||||
#define SAVE_L_m4n8 SAVE_L_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
|
||||
#define SAVE_L_m4n12 SAVE_L_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
|
||||
#define SAVE_R_m4n4 unit_save_m4n2(%%xmm4,%%xmm5) unit_save_m4n2(%%xmm6,%%xmm7)
|
||||
#define SAVE_R_m4n8 SAVE_R_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
|
||||
#define SAVE_R_m4n12 SAVE_R_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
|
||||
/*
 * COMPUTE_L_m4(ndim,sim): asm text for a 4-row tile over ndim columns, with B read
 * through %1.  Clears the accumulators, sets %4 <- r13 and %1 <- r14, then runs
 * KERNEL_k1m4n<ndim>(%1) once per count in %4 (loop exits when %4 reaches zero),
 * stores with SAVE_L_m4n<ndim> and advances %2 by 16 bytes.
 * sim is pasted into the numeric local labels (<ndim><sim>442 / <ndim><sim>443) so that
 * several expansions inside one asm statement get distinct labels.
 */
#define COMPUTE_L_m4(ndim,sim) \
|
||||
INIT_m4n##ndim\
|
||||
"movq %%r13,%4; movq %%r14,%1;"\
|
||||
#ndim""#sim"442:\n\t"\
|
||||
"testq %4,%4; jz "#ndim""#sim"443f;"\
|
||||
KERNEL_k1m4n##ndim(%1)\
|
||||
"decq %4; jmp "#ndim""#sim"442b;"\
|
||||
#ndim""#sim"443:\n\t"\
|
||||
SAVE_L_m4n##ndim "addq $16,%2;"
|
||||
/*
 * COMPUTE_R_m4(ndim,sim): asm text for a second pass over the same 4 rows, with B read
 * through r15.  First subtracts r12 from %0 once, clears the accumulators, sets
 * %4 <- r13 and r15 <- r14 + 3*r12, then runs KERNEL_k1m4n<ndim>(%%r15) once per count
 * in %4 and stores with SAVE_R_m4n<ndim>.
 * Unlike the L variant, nothing here reloads %5 or advances %2; the SAVE_R_* macros
 * store through %5 as they find it.
 * sim is pasted into the numeric local labels (<ndim><sim>442 / <ndim><sim>443).
 */
#define COMPUTE_R_m4(ndim,sim) \
|
||||
"subq %%r12,%0;"\
|
||||
INIT_m4n##ndim\
|
||||
"movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
|
||||
#ndim""#sim"442:\n\t"\
|
||||
"testq %4,%4; jz "#ndim""#sim"443f;"\
|
||||
KERNEL_k1m4n##ndim(%%r15)\
|
||||
"decq %4; jmp "#ndim""#sim"442b;"\
|
||||
#ndim""#sim"443:\n\t"\
|
||||
SAVE_R_m4n##ndim
|
||||
#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855)
|
||||
#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855)
|
||||
#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855)
|
||||
#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855)
|
||||
#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855)
|
||||
#define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955)
|
||||
#define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955)
|
||||
#define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955)
|
||||
#define COMPUTE_m4(ndim) COMPUTE_m4_n##ndim
|
||||
|
||||
/* m = 2 */
/* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */

/* INIT_m2n<N>: zero the accumulators used by the N-column variant. */
#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"

/* KERNEL_k1m2n<N>(b_addr): one k-step. Reads 8 bytes from (%0) and the B
 * values from (b_addr), accumulates with FMA, then advances %0 by 8 bytes and
 * b_addr by 4*N bytes (16 bytes for N >= 4, where the additional column groups
 * are reached through r12-scaled offsets from b_addr). */
#define KERNEL_k1m2n1(b_addr) \
    "vmovsd (%0),%%xmm1; addq $8,%0;"\
    "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
    "addq $4,"#b_addr";"

/* SAVE_L_m2n<N>: accumulator = xmm0 * accumulator + value loaded from C, stored back to C. */
#define SAVE_L_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"

#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"

#define KERNEL_k1m2n2(b_addr) \
    "vmovsd (%0),%%xmm1; addq $8,%0;"\
    "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
    "vbroadcastss 4("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
    "addq $8,"#b_addr";"

#define SAVE_L_m2n2 SAVE_L_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"

#define INIT_m2n4 INIT_m2n2
#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"

/* From N = 4 on the roles are swapped: a 16-byte vector is loaded from B and
 * the two values at (%0) and 4(%0) are broadcast. */
#define KERNEL_k1m2n4(b_addr) \
    "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\
    "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
    "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
    "addq $8,%0;"

#define KERNEL_k1m2n8(b_addr) \
    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\
    "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
    "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
    "addq $8,%0;"

#define KERNEL_k1m2n12(b_addr) \
    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\
    "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
    "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
    "addq $8,%0;"

/* unit_save_m2n4(c1,c2): interleave accumulators c1/c2, then update four
 * consecutive C locations reached via %5 with stride %3 (8 bytes each),
 * scaling by xmm0. %5 ends up advanced by 4 * %3. */
#define unit_save_m2n4(c1,c2) \
    "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\
    "leaq (%5,%3,2),%5;"\
    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\
    "leaq (%5,%3,2),%5;"

/* SAVE_L_* starts from %2 (copied into %5); SAVE_R_* continues from wherever
 * %5 was left by the preceding save. */
#define SAVE_L_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
#define SAVE_L_m2n8 SAVE_L_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
#define SAVE_L_m2n12 SAVE_L_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)
#define SAVE_R_m2n4 unit_save_m2n4(%%xmm4,%%xmm5)
#define SAVE_R_m2n8 SAVE_R_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
#define SAVE_R_m2n12 SAVE_R_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)

/* COMPUTE_L_m2(ndim,sim): counted k-loop (%4 reloaded from r13, B address %1
 * reloaded from r14) followed by SAVE_L_m2n<ndim>; %2 is then advanced by
 * 8 bytes. Labels are <ndim><sim>222 (loop head) and <ndim><sim>223 (exit). */
#define COMPUTE_L_m2(ndim,sim) \
    INIT_m2n##ndim \
    "movq %%r13,%4; movq %%r14,%1;"\
    #ndim""#sim"222:\n\t"\
    "testq %4,%4; jz "#ndim""#sim"223f;"\
    KERNEL_k1m2n##ndim(%1)\
    "decq %4; jmp "#ndim""#sim"222b;"\
    #ndim""#sim"223:\n\t"\
    SAVE_L_m2n##ndim "addq $8,%2;"

/* COMPUTE_R_m2(ndim,sim): second pass over the same two rows. Subtracts
 * 8*r13 bytes from %0 (r13 is shifted and shifted back so it is unchanged),
 * then loops with r15 = r14 + 3*r12 as B address and ends with SAVE_R_m2n<ndim>. */
#define COMPUTE_R_m2(ndim,sim) \
    "salq $3,%%r13;subq %%r13,%0;sarq $3,%%r13;"\
    INIT_m2n##ndim \
    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
    #ndim""#sim"222:\n\t"\
    "testq %4,%4; jz "#ndim""#sim"223f;"\
    KERNEL_k1m2n##ndim(%%r15)\
    "decq %4; jmp "#ndim""#sim"222b;"\
    #ndim""#sim"223:\n\t"\
    SAVE_R_m2n##ndim

/* Per-width dispatch: up to 12 columns use one L pass; 16/20/24 columns use a
 * 12-column L pass plus an R pass over the remaining 4/8/12 columns. */
#define COMPUTE_m2_n1 COMPUTE_L_m2(1,77877)
#define COMPUTE_m2_n2 COMPUTE_L_m2(2,77877)
#define COMPUTE_m2_n4 COMPUTE_L_m2(4,77877)
#define COMPUTE_m2_n8 COMPUTE_L_m2(8,77877)
#define COMPUTE_m2_n12 COMPUTE_L_m2(12,77877)
#define COMPUTE_m2_n16 COMPUTE_L_m2(12,77777) COMPUTE_R_m2(4,77977)
#define COMPUTE_m2_n20 COMPUTE_L_m2(12,77677) COMPUTE_R_m2(8,77977)
#define COMPUTE_m2_n24 COMPUTE_L_m2(12,77577) COMPUTE_R_m2(12,77977)
/* COMPUTE_m2(ndim): entry point; ndim must be one of 1,2,4,8,12,16,20,24. */
#define COMPUTE_m2(ndim) COMPUTE_m2_n##ndim
|
||||
|
||||
/* m = 1 */
/* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */

/* INIT_m1n<N>: zero the accumulators used by the N-column variant. */
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"

/* KERNEL_k1m1n<N>(b_addr): one k-step. Reads one float from (%0) and the B
 * values from (b_addr), accumulates with FMA, then advances %0 by 4 bytes and
 * b_addr by 4*N bytes (16 bytes for N >= 4, where the additional column groups
 * are reached through r12-scaled offsets from b_addr). */
#define KERNEL_k1m1n1(b_addr) \
    "vmovss ("#b_addr"),%%xmm3; addq $4,"#b_addr";"\
    "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
    "addq $4,%0;"

/* SAVE_L_m1n1: xmm4 = xmm0 * xmm4 + (%2), stored back to (%2). */
#define SAVE_L_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"

#define INIT_m1n2 INIT_m1n1

#define KERNEL_k1m1n2(b_addr) \
    "vmovsd ("#b_addr"),%%xmm3; addq $8,"#b_addr";"\
    "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
    "addq $4,%0;"

/* SAVE_L_m1n2: gather the two C values at (%2) and (%2,%3,1) into one vector,
 * apply the xmm0-scaled update, and scatter lanes 0 and 1 back. */
#define SAVE_L_m1n2 \
    "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
    "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"

#define INIT_m1n4 INIT_m1n2
#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"

#define KERNEL_k1m1n4(b_addr) \
    "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\
    "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
    "addq $4,%0;"

#define KERNEL_k1m1n8(b_addr) \
    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\
    "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
    "addq $4,%0;"

#define KERNEL_k1m1n12(b_addr) \
    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\
    "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
    "addq $4,%0;"

/* unit_save_m1n4(c1): split accumulator c1 into its low and high halves
 * (xmm2 / xmm1, upper lanes zeroed via xmm10) and update four consecutive C
 * locations reached via %5 with stride %3, two per half. %5 ends up advanced
 * by 4 * %3. */
#define unit_save_m1n4(c1) \
    "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
    "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
    "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
    "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
    "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"

/* SAVE_L_* starts from %2 (copied into %5); SAVE_R_* continues from wherever
 * %5 was left by the preceding save. */
#define SAVE_L_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
#define SAVE_L_m1n8 SAVE_L_m1n4 unit_save_m1n4(%%xmm5)
#define SAVE_L_m1n12 SAVE_L_m1n8 unit_save_m1n4(%%xmm6)
#define SAVE_R_m1n4 unit_save_m1n4(%%xmm4)
#define SAVE_R_m1n8 SAVE_R_m1n4 unit_save_m1n4(%%xmm5)
#define SAVE_R_m1n12 SAVE_R_m1n8 unit_save_m1n4(%%xmm6)

/* COMPUTE_L_m1(ndim,sim): counted k-loop (%4 reloaded from r13, B address %1
 * reloaded from r14) followed by SAVE_L_m1n<ndim>; %2 is then advanced by
 * 4 bytes. Labels are <ndim><sim>112 (loop head) and <ndim><sim>113 (exit). */
#define COMPUTE_L_m1(ndim,sim) \
    INIT_m1n##ndim \
    "movq %%r13,%4; movq %%r14,%1;"\
    #ndim""#sim"112:\n\t"\
    "testq %4,%4; jz "#ndim""#sim"113f;"\
    KERNEL_k1m1n##ndim(%1)\
    "decq %4; jmp "#ndim""#sim"112b;"\
    #ndim""#sim"113:\n\t"\
    SAVE_L_m1n##ndim "addq $4,%2;"

/* COMPUTE_R_m1(ndim,sim): second pass over the same row. Subtracts 4*r13
 * bytes from %0 (r13 is shifted and shifted back so it is unchanged), then
 * loops with r15 = r14 + 3*r12 as B address and ends with SAVE_R_m1n<ndim>. */
#define COMPUTE_R_m1(ndim,sim) \
    "salq $2,%%r13;subq %%r13,%0;sarq $2,%%r13;"\
    INIT_m1n##ndim \
    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
    #ndim""#sim"112:\n\t"\
    "testq %4,%4; jz "#ndim""#sim"113f;"\
    KERNEL_k1m1n##ndim(%%r15)\
    "decq %4; jmp "#ndim""#sim"112b;"\
    #ndim""#sim"113:\n\t"\
    SAVE_R_m1n##ndim

/* Per-width dispatch: up to 12 columns use one L pass; 16/20/24 columns use a
 * 12-column L pass plus an R pass over the remaining 4/8/12 columns. */
#define COMPUTE_m1_n1 COMPUTE_L_m1(1,99899)
#define COMPUTE_m1_n2 COMPUTE_L_m1(2,99899)
#define COMPUTE_m1_n4 COMPUTE_L_m1(4,99899)
#define COMPUTE_m1_n8 COMPUTE_L_m1(8,99899)
#define COMPUTE_m1_n12 COMPUTE_L_m1(12,99899)
#define COMPUTE_m1_n16 COMPUTE_L_m1(12,99799) COMPUTE_R_m1(4,99999)
#define COMPUTE_m1_n20 COMPUTE_L_m1(12,99699) COMPUTE_R_m1(8,99999)
#define COMPUTE_m1_n24 COMPUTE_L_m1(12,99599) COMPUTE_R_m1(12,99999)
/* COMPUTE_m1(ndim): entry point; ndim must be one of 1,2,4,8,12,16,20,24. */
#define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim
|
||||
|
||||
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */

/*
 * COMPUTE(ndim): process one panel of `ndim` columns for all M rows.
 *
 * Expands to a braced statement that must be used inside a function where the
 * locals a_pointer, b_pointer, c_pointer, ldc_in_bytes, K, ctemp, alp, M,
 * next_b and LDC are in scope (they are referenced by name below).
 *
 * Inside the asm:
 *   - zmm0 is loaded with the broadcast of the float at (%6);
 *   - %4, %1 and %7 are saved in r13, r14 and r11 and restored at the end;
 *     r12 holds %4 << 4;
 *   - COMPUTE_m16 is repeated while %7 >= 16, then COMPUTE_m8, COMPUTE_m4,
 *     COMPUTE_m2 and COMPUTE_m1 each run at most once for the leftover rows.
 * The labels 3310<x><ndim> embed the stringized ndim, so ndim must be a
 * literal accepted by the COMPUTE_m* macros.
 *
 * After the asm, a_pointer is rewound by M*K elements, b_pointer is advanced
 * by ndim*K elements and c_pointer is moved by LDC*ndim - M elements.
 */
#define COMPUTE(ndim) {\
    next_b = b_pointer + ndim * K;\
    __asm__ __volatile__(\
      "vbroadcastss (%6),%%zmm0;"\
      "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
      "cmpq $16,%7;jb 33101"#ndim"f;"\
      "33109"#ndim":\n\t"\
      COMPUTE_m16(ndim)\
      "subq $16,%7;cmpq $16,%7;jnb 33109"#ndim"b;"\
      "33101"#ndim":\n\t"\
      "cmpq $8,%7;jb 33102"#ndim"f;"\
      COMPUTE_m8(ndim)\
      "subq $8,%7;"\
      "33102"#ndim":\n\t"\
      "cmpq $4,%7;jb 33103"#ndim"f;"\
      COMPUTE_m4(ndim)\
      "subq $4,%7;"\
      "33103"#ndim":\n\t"\
      "cmpq $2,%7;jb 33104"#ndim"f;"\
      COMPUTE_m2(ndim)\
      "subq $2,%7;"\
      "33104"#ndim":\n\t"\
      "testq %7,%7;jz 33105"#ndim"f;"\
      COMPUTE_m1(ndim)\
      "33105"#ndim":\n\t"\
      "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
      :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M),"+r"(next_b)\
      ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
      "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
      "cc","memory");\
    a_pointer -= M * K; b_pointer += ndim * K;c_pointer += LDC * ndim - M;\
}
|
||||
int __attribute__ ((noinline))
|
||||
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
|
||||
{
|
||||
if(m==0||n==0||k==0||alpha==(float)0.0) return 0;
|
||||
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha;
|
||||
int64_t M = (int64_t)m, K = (int64_t)k;
|
||||
BLASLONG n_count = n;
|
||||
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA,*next_b = B;
|
||||
for(;n_count>23;n_count-=24) COMPUTE(24)
|
||||
for(;n_count>19;n_count-=20) COMPUTE(20)
|
||||
for(;n_count>15;n_count-=16) COMPUTE(16)
|
||||
for(;n_count>11;n_count-=12) COMPUTE(12)
|
||||
for(;n_count>7;n_count-=8) COMPUTE(8)
|
||||
for(;n_count>3;n_count-=4) COMPUTE(4)
|
||||
for(;n_count>1;n_count-=2) COMPUTE(2)
|
||||
if(n_count>0) COMPUTE(1)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#include <immintrin.h>
|
||||
/* The code below is copied from the sgemm kernel written by Arjan van der Ven. */
|
||||
|
||||
/*
 * "Direct sgemm" code. This code operates directly on the inputs and outputs
 * of the sgemm call, avoiding the copies, memory realignments and threading,
 * and only supports alpha = 1 and beta = 0.
 * This is a common case and provides value for relatively small matrices.
 * For larger matrices the "regular" sgemm code is superior; there the cost of
 * copying/shuffling the B matrix really pays off.
 */
|
||||
|
||||
|
||||
|
||||
/*
 * Building blocks for the direct kernel. Each family (512/256/128-bit vectors
 * and plain scalars) provides the same five operations:
 *   DECLARE_RESULT_*   declare and zero the accumulator result<N><M>
 *   BROADCAST_LOAD_A_* declare Aval<M> = A[k + strideA * (i + M)] (broadcast)
 *   LOAD_B_*           declare Bval<N> = B[strideB * k + j + N * width ...]
 *   MATMUL_*           result<N><M> += Aval<M> * Bval<N>
 *   STORE_*            write result<N><M> to R[(i + M) * strideR + j + N * width ...]
 * N is the column-group index and M the row offset; both are token-pasted into
 * identifiers, so they must be plain tokens. An argument that a macro does not
 * use may be any placeholder token (callers pass `x`).
 * The macros rely on A, B, R, strideA, strideB, strideR, i, j and k being in
 * scope at the point of use. Arguments used in arithmetic are parenthesized.
 */
#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+(M))]))
#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + ((N)*16)])
#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+(M)) * strideR + j+((N)*16)], result##N##M)

#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+(M))]))
#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + ((N)*8)])
#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+(M)) * strideR + j+((N)*8)], result##N##M)

#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+(M))]))
#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + ((N)*4)])
#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
#define STORE_128(N,M) _mm_storeu_ps(&R[(i+(M)) * strideR + j+((N)*4)], result##N##M)

/* The scalar variants carry their own trailing ';' (unlike the vector ones);
 * existing callers use them both with and without an extra semicolon, so it
 * must stay. */
#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + (M))];
#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + (N)];
#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N;
#define STORE_SCALAR(N,M) R[(i+(M)) * strideR + j + (N)] = result##N##M;
|
||||
|
||||
/* Multiply two unsigned values, clamping to the maximum representable value
 * instead of wrapping around on overflow. */
static unsigned long long sgemm_direct_mul_sat(unsigned long long a, unsigned long long b)
{
	const unsigned long long max = ~0ULL;

	if (a != 0 && b > max / a)
		return max;
	return a * b;
}

/*
 * Heuristic: should a problem of size M x N x K use sgemm_kernel_direct()
 * rather than the regular (copying, possibly threaded) sgemm path?
 *
 * Returns 1 when the direct kernel is expected to be the faster choice,
 * 0 otherwise.
 *
 * The work estimate M*N*K is computed with saturation: the plain product was
 * evaluated in the signed BLASLONG type, where overflow is undefined and could
 * make an enormous problem look small enough for the direct kernel.
 */
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
{
	unsigned long long mnk = sgemm_direct_mul_sat(
		sgemm_direct_mul_sat((unsigned long long)M, (unsigned long long)N),
		(unsigned long long)K);

	/* large matrices -> not performant */
	if (mnk >= 28 * 512 * 512)
		return 0;

	/*
	 * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
	 * and the regular sgemm copy/realignment of data pays off much quicker
	 */
	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
		return 0;

#ifdef SMP
	/* if we can run multithreaded, the threading lowers the threshold */
	if (mnk > 2 * 350 * 512 && num_cpu_avail(3) > 1)
		return 0;
#endif

	return 1;
}
|
||||
|
||||
|
||||
|
||||
/*
 * Direct SGEMM: R = A * B computed straight from the caller's buffers.
 *
 * Element (row, col) of the result is the sum over k in [0, K) of
 *   A[k + strideA * row] * B[strideB * k + col]
 * and is written to R[row * strideR + col]; the previous contents of R are
 * overwritten, not accumulated into.
 *
 * M, N, K  : rows of A/R, columns of B/R, and inner dimension.
 * strideA, strideB, strideR : distance in elements between consecutive rows
 *            of A, B and R respectively.
 *
 * Rows are processed in bands of 4, then 2, then 1. Inside each band the
 * columns are consumed in tiles of 64, 32 and 16 (512-bit vectors), 8
 * (256-bit), 4 (128-bit), then 2 and 1 (scalar), widest first.
 *
 * All indices and tile bounds are BLASLONG, matching the extent parameters,
 * so large extents are not truncated to int.
 *
 * In the macro invocations below `x` is a placeholder for an argument the
 * macro does not use.
 */
void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
{
    BLASLONG i, j, k;

    /* Extents rounded down to the tile sizes used below. */
    const BLASLONG m4 = M & ~(BLASLONG)3;
    const BLASLONG m2 = M & ~(BLASLONG)1;

    const BLASLONG n64 = N & ~(BLASLONG)63;
    const BLASLONG n32 = N & ~(BLASLONG)31;
    const BLASLONG n16 = N & ~(BLASLONG)15;
    const BLASLONG n8 = N & ~(BLASLONG)7;
    const BLASLONG n4 = N & ~(BLASLONG)3;
    const BLASLONG n2 = N & ~(BLASLONG)1;

    /* ---- bands of 4 rows ---- */
    for (i = 0; i < m4; i += 4) {
        j = 0;

        for (; j < n64; j += 64) {
            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
            DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);
            DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2);
            DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                BROADCAST_LOAD_A_512(x, 1);
                BROADCAST_LOAD_A_512(x, 2);
                BROADCAST_LOAD_A_512(x, 3);

                LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x);

                MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
                MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1);
                MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2);
                MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3);
            }
            STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
            STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1);
            STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2);
            STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3);
        }

        for (; j < n32; j += 32) {
            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);
            DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1);
            DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2);
            DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                BROADCAST_LOAD_A_512(x, 1);
                BROADCAST_LOAD_A_512(x, 2);
                BROADCAST_LOAD_A_512(x, 3);

                LOAD_B_512(0, x); LOAD_B_512(1, x);

                MATMUL_512(0, 0); MATMUL_512(1, 0);
                MATMUL_512(0, 1); MATMUL_512(1, 1);
                MATMUL_512(0, 2); MATMUL_512(1, 2);
                MATMUL_512(0, 3); MATMUL_512(1, 3);
            }
            STORE_512(0, 0); STORE_512(1, 0);
            STORE_512(0, 1); STORE_512(1, 1);
            STORE_512(0, 2); STORE_512(1, 2);
            STORE_512(0, 3); STORE_512(1, 3);
        }

        for (; j < n16; j += 16) {
            DECLARE_RESULT_512(0, 0);
            DECLARE_RESULT_512(0, 1);
            DECLARE_RESULT_512(0, 2);
            DECLARE_RESULT_512(0, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                BROADCAST_LOAD_A_512(x, 1);
                BROADCAST_LOAD_A_512(x, 2);
                BROADCAST_LOAD_A_512(x, 3);

                LOAD_B_512(0, x);

                MATMUL_512(0, 0);
                MATMUL_512(0, 1);
                MATMUL_512(0, 2);
                MATMUL_512(0, 3);
            }
            STORE_512(0, 0);
            STORE_512(0, 1);
            STORE_512(0, 2);
            STORE_512(0, 3);
        }

        for (; j < n8; j += 8) {
            DECLARE_RESULT_256(0, 0);
            DECLARE_RESULT_256(0, 1);
            DECLARE_RESULT_256(0, 2);
            DECLARE_RESULT_256(0, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_256(x, 0);
                BROADCAST_LOAD_A_256(x, 1);
                BROADCAST_LOAD_A_256(x, 2);
                BROADCAST_LOAD_A_256(x, 3);

                LOAD_B_256(0, x);

                MATMUL_256(0, 0);
                MATMUL_256(0, 1);
                MATMUL_256(0, 2);
                MATMUL_256(0, 3);
            }
            STORE_256(0, 0);
            STORE_256(0, 1);
            STORE_256(0, 2);
            STORE_256(0, 3);
        }

        for (; j < n4; j += 4) {
            DECLARE_RESULT_128(0, 0);
            DECLARE_RESULT_128(0, 1);
            DECLARE_RESULT_128(0, 2);
            DECLARE_RESULT_128(0, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_128(x, 0);
                BROADCAST_LOAD_A_128(x, 1);
                BROADCAST_LOAD_A_128(x, 2);
                BROADCAST_LOAD_A_128(x, 3);

                LOAD_B_128(0, x);

                MATMUL_128(0, 0);
                MATMUL_128(0, 1);
                MATMUL_128(0, 2);
                MATMUL_128(0, 3);
            }
            STORE_128(0, 0);
            STORE_128(0, 1);
            STORE_128(0, 2);
            STORE_128(0, 3);
        }

        for (; j < n2; j += 2) {
            DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);
            DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1);
            DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2);
            DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_SCALAR(x, 0);
                BROADCAST_LOAD_A_SCALAR(x, 1);
                BROADCAST_LOAD_A_SCALAR(x, 2);
                BROADCAST_LOAD_A_SCALAR(x, 3);

                LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x);

                MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
                MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1);
                MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2);
                MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3);
            }
            STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
            STORE_SCALAR(0, 1); STORE_SCALAR(1, 1);
            STORE_SCALAR(0, 2); STORE_SCALAR(1, 2);
            STORE_SCALAR(0, 3); STORE_SCALAR(1, 3);
        }

        for (; j < N; j++) {
            DECLARE_RESULT_SCALAR(0, 0);
            DECLARE_RESULT_SCALAR(0, 1);
            DECLARE_RESULT_SCALAR(0, 2);
            DECLARE_RESULT_SCALAR(0, 3);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_SCALAR(x, 0);
                BROADCAST_LOAD_A_SCALAR(x, 1);
                BROADCAST_LOAD_A_SCALAR(x, 2);
                BROADCAST_LOAD_A_SCALAR(x, 3);

                LOAD_B_SCALAR(0, x);

                MATMUL_SCALAR(0, 0);
                MATMUL_SCALAR(0, 1);
                MATMUL_SCALAR(0, 2);
                MATMUL_SCALAR(0, 3);
            }
            STORE_SCALAR(0, 0);
            STORE_SCALAR(0, 1);
            STORE_SCALAR(0, 2);
            STORE_SCALAR(0, 3);
        }
    }

    /* ---- at most one band of 2 rows (i continues from the 4-row bands) ---- */
    for (; i < m2; i += 2) {
        j = 0;

        for (; j < n64; j += 64) {
            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
            DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                BROADCAST_LOAD_A_512(x, 1);

                LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x);

                MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
                MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1);
            }
            STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
            STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1);
        }

        for (; j < n32; j += 32) {
            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);
            DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                BROADCAST_LOAD_A_512(x, 1);

                LOAD_B_512(0, x); LOAD_B_512(1, x);

                MATMUL_512(0, 0); MATMUL_512(1, 0);
                MATMUL_512(0, 1); MATMUL_512(1, 1);
            }
            STORE_512(0, 0); STORE_512(1, 0);
            STORE_512(0, 1); STORE_512(1, 1);
        }

        for (; j < n16; j += 16) {
            DECLARE_RESULT_512(0, 0);
            DECLARE_RESULT_512(0, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                BROADCAST_LOAD_A_512(x, 1);

                LOAD_B_512(0, x);

                MATMUL_512(0, 0);
                MATMUL_512(0, 1);
            }
            STORE_512(0, 0);
            STORE_512(0, 1);
        }

        for (; j < n8; j += 8) {
            DECLARE_RESULT_256(0, 0);
            DECLARE_RESULT_256(0, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_256(x, 0);
                BROADCAST_LOAD_A_256(x, 1);

                LOAD_B_256(0, x);

                MATMUL_256(0, 0);
                MATMUL_256(0, 1);
            }
            STORE_256(0, 0);
            STORE_256(0, 1);
        }

        for (; j < n4; j += 4) {
            DECLARE_RESULT_128(0, 0);
            DECLARE_RESULT_128(0, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_128(x, 0);
                BROADCAST_LOAD_A_128(x, 1);

                LOAD_B_128(0, x);

                MATMUL_128(0, 0);
                MATMUL_128(0, 1);
            }
            STORE_128(0, 0);
            STORE_128(0, 1);
        }

        for (; j < n2; j += 2) {
            DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);
            DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_SCALAR(x, 0);
                BROADCAST_LOAD_A_SCALAR(x, 1);

                LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x);

                MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
                MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1);
            }
            STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
            STORE_SCALAR(0, 1); STORE_SCALAR(1, 1);
        }

        for (; j < N; j++) {
            DECLARE_RESULT_SCALAR(0, 0);
            DECLARE_RESULT_SCALAR(0, 1);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_SCALAR(x, 0);
                BROADCAST_LOAD_A_SCALAR(x, 1);

                LOAD_B_SCALAR(0, x);

                MATMUL_SCALAR(0, 0);
                MATMUL_SCALAR(0, 1);
            }
            STORE_SCALAR(0, 0);
            STORE_SCALAR(0, 1);
        }
    }

    /* ---- at most one trailing single row ---- */
    for (; i < M; i += 1) {
        j = 0;

        for (; j < n64; j += 64) {
            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x);
                MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
            }
            STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
        }

        for (; j < n32; j += 32) {
            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);
                LOAD_B_512(0, x); LOAD_B_512(1, x);
                MATMUL_512(0, 0); MATMUL_512(1, 0);
            }
            STORE_512(0, 0); STORE_512(1, 0);
        }

        for (; j < n16; j += 16) {
            DECLARE_RESULT_512(0, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_512(x, 0);

                LOAD_B_512(0, x);

                MATMUL_512(0, 0);
            }
            STORE_512(0, 0);
        }

        for (; j < n8; j += 8) {
            DECLARE_RESULT_256(0, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_256(x, 0);
                LOAD_B_256(0, x);
                MATMUL_256(0, 0);
            }
            STORE_256(0, 0);
        }

        for (; j < n4; j += 4) {
            DECLARE_RESULT_128(0, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_128(x, 0);
                LOAD_B_128(0, x);
                MATMUL_128(0, 0);
            }
            STORE_128(0, 0);
        }

        for (; j < n2; j += 2) {
            DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_SCALAR(x, 0);
                LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x);
                MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
            }
            STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
        }

        for (; j < N; j++) {
            DECLARE_RESULT_SCALAR(0, 0);

            for (k = 0; k < K; k++) {
                BROADCAST_LOAD_A_SCALAR(x, 0);
                LOAD_B_SCALAR(0, x);
                MATMUL_SCALAR(0, 0);
            }
            STORE_SCALAR(0, 0);
        }
    }
}
|
||||
283
kernel/x86_64/zgemm_kernel_4x2_skylakex.c
Normal file
283
kernel/x86_64/zgemm_kernel_4x2_skylakex.c
Normal file
@@ -0,0 +1,283 @@
|
||||
#include "common.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/*
 * ZGEMM_SKX_MODE encodes which operand block is conjugated, derived from the
 * two-letter variant macro supplied by the build:
 *   0 - neither a_block nor b_block
 *   1 - a_block only
 *   2 - b_block only
 *   3 - both a_block and b_block
 * If none of the variant macros is defined ZGEMM_SKX_MODE stays undefined, and
 * the `#if ZGEMM_SKX_MODE == ...` tests further down then treat it as 0.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define ZGEMM_SKX_MODE 0 /* not to do conjugation on a_block and b_block */
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define ZGEMM_SKX_MODE 1 /* do conjugation on a_block, not b_block */
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define ZGEMM_SKX_MODE 2 /* do conjugation on b_block, not a_block */
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define ZGEMM_SKX_MODE 3 /* do conjugation on a_block and b_block */
#endif

/* recommended settings: GEMM_DEFAULT_Q = 128, GEMM_DEFAULT_P = 256 */
/* %0=a_pointer, %1=b_pointer, %2=c_pointer, %3=c_store, %4=ldc(bytes), %5=&constval, %6 = k_counter, %7 = m_counter, %8 = b_pref */
/* const double constval[4] = {alpha_r, alpha_i, -1, 1}; */
/* r11 = m; r12 = k * 32; r13 = k; r14 = b_head; r15 = %1 + r12 * 3; */

/* GENERAL_INIT: stash %7, %1 and %6 in r11, r14 and r13; r12 = %6 << 5. */
#define GENERAL_INIT "movq %7,%%r11; movq %1,%%r14; movq %6,%%r13; movq %6,%%r12; salq $5,%%r12;"
/* GENERAL_RECOVER: restore %7, %6 and %1 from r11, r13 and r14. */
#define GENERAL_RECOVER "movq %%r11,%7; movq %%r13,%6; movq %%r14,%1;"
/* CONSTZMM_INIT: zmm0 / zmm1 = broadcast of the doubles at (%5) / 8(%5);
 * zmm2 = broadcast of the 16 bytes at 16(%5). */
#define CONSTZMM_INIT "vbroadcastsd (%5),%%zmm0; vbroadcastsd 8(%5),%%zmm1; vbroadcastf32x4 16(%5),%%zmm2;"
/* COMPUTE_INIT: reload %6 and %1 from r13 and r14; r15 = r14 + 3 * r12. */
#define COMPUTE_INIT "movq %%r13,%6; movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"
|
||||
|
||||
/* m=4, zmm0=alpha_r, zmm1=alpha_i, zmm2={-1,1,...,-1,1}, zmm3-zmm7 for temporary use, zmm8-zmm31 for accumulators */

/* unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,<address>): broadcast the 16 bytes
 * at b_off(<address>) into zmm3 and accumulate a_r*zmm3 into c_le and
 * +/- a_i*zmm3 into c_ri. The sign of the second product is the only
 * difference between the two variants. <address> is the variadic tail so that
 * base/index/scale forms containing commas can be passed through; it is
 * stringized as written, so call sites keep it free of spaces. */
#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 /* not to do conjugation on a_block */
  #define unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,...) \
    "vbroadcastf32x4 "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231pd "#a_r",%%zmm3,"#c_le"; vfmadd231pd "#a_i",%%zmm3,"#c_ri";"
#else /* do conjugation on a_block */
  #define unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,...) \
    "vbroadcastf32x4 "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231pd "#a_r",%%zmm3,"#c_le"; vfnmadd231pd "#a_i",%%zmm3,"#c_ri";"
#endif

/* KERNEL_h_* : the arithmetic of one k-step (loads from %0 into zmm4/zmm5,
 *              prefetch, %0 += 64, then the per-column FMAs).
 * KERNEL_t_* : the same plus the B pointer increments that end the step
 *              (%1 for the first column groups, r15 for the groups from n=8 on). */
#define KERNEL_h_k1m4n1 \
    "vmovddup (%0),%%zmm4; vmovddup 8(%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
    unit_kernel_k1m4n1(%%zmm4,%%zmm5,0,%%zmm8,%%zmm9,%1)
#define KERNEL_t_k1m4n1 KERNEL_h_k1m4n1 "addq $16,%1;"
#define KERNEL_h_k1m4n2 KERNEL_h_k1m4n1 unit_kernel_k1m4n1(%%zmm4,%%zmm5,16,%%zmm10,%%zmm11,%1)
#define KERNEL_t_k1m4n2 KERNEL_h_k1m4n2 "addq $32,%1;"
/* unit_kernel_k1m4n2: two adjacent columns (offsets 0 and 16) at one address. */
#define unit_kernel_k1m4n2(c1le,c1ri,c2le,c2ri,...) \
    unit_kernel_k1m4n1(%%zmm4,%%zmm5,0,c1le,c1ri,__VA_ARGS__)\
    unit_kernel_k1m4n1(%%zmm4,%%zmm5,16,c2le,c2ri,__VA_ARGS__)
#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_kernel_k1m4n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
#define KERNEL_t_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
#define KERNEL_t_k1m4n6 KERNEL_h_k1m4n4 unit_kernel_k1m4n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) "addq $32,%1;"
/* From n=8 on the h-variants build on KERNEL_t_k1m4n6, i.e. %1 has already
 * been advanced, and the extra column groups are addressed through r15. */
#define KERNEL_h_k1m4n8 KERNEL_t_k1m4n6 unit_kernel_k1m4n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15)
#define KERNEL_t_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%%r15;"
#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_kernel_k1m4n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1)
#define KERNEL_t_k1m4n10 KERNEL_h_k1m4n10 "addq $32,%%r15;"
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_kernel_k1m4n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
#define KERNEL_t_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%%r15;"

/* unit_save_m4n1(c_le,c_ri,<address>): fold the accumulator pair c_le/c_ri
 * together (using the constants in zmm2), combine the result with zmm0/zmm1
 * and the 64 bytes currently stored at <address>, and store it back there.
 * Clobbers c_le, c_ri and zmm4. The two variants differ in the signs applied. */
#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 1 /* not to do conjugation on b_block */
  #define unit_save_m4n1(c_le,c_ri,...) \
    "vpermilpd $85,"#c_ri","#c_ri"; vfmadd231pd "#c_ri",%%zmm2,"#c_le"; vpermilpd $85,"#c_le",%%zmm4;"\
    "vfmaddsub213pd ("#__VA_ARGS__"),%%zmm1,%%zmm4; vfmaddsub213pd %%zmm4,%%zmm0,"#c_le"; vmovupd "#c_le",("#__VA_ARGS__");"
#else /* do conjugation on b_block */
  #define unit_save_m4n1(c_le,c_ri,...) \
    "vpermilpd $85,"#c_ri","#c_ri"; vfnmadd231pd "#c_ri",%%zmm2,"#c_le"; vpermilpd $85,"#c_le",%%zmm4;"\
    "vfmsubadd213pd ("#__VA_ARGS__"),%%zmm0,"#c_le"; vfmsubadd231pd %%zmm4,%%zmm1,"#c_le"; vmovupd "#c_le",("#__VA_ARGS__");"
#endif

/* SAVE_SETUP_m4: %3 = %2 (store cursor for this tile), then %2 += 64. */
#define SAVE_SETUP_m4 "movq %2,%3; addq $64,%2;"
#define SAVE_m4n1 SAVE_SETUP_m4 unit_save_m4n1(%%zmm8,%%zmm9,%3)
#define SAVE_m4n2 SAVE_m4n1 unit_save_m4n1(%%zmm10,%%zmm11,%3,%4,1)
/* unit_save_m4n2: advance %3 by 2 * %4, then save two more columns. */
#define unit_save_m4n2(c1le,c1ri,c2le,c2ri) \
    "leaq (%3,%4,2),%3;" unit_save_m4n1(c1le,c1ri,%3) unit_save_m4n1(c2le,c2ri,%3,%4,1)
#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31)

/* INIT_m4n<N>: zero the 2*N accumulators used by the N-column variant. */
#define unit_init_m4n1(c_le,c_ri) "vpxorq "#c_le","#c_le","#c_le"; vpxorq "#c_ri","#c_ri","#c_ri";"
#define INIT_m4n1 unit_init_m4n1(%%zmm8,%%zmm9)
#define INIT_m4n2 INIT_m4n1 unit_init_m4n1(%%zmm10,%%zmm11)
#define INIT_m4n4 INIT_m4n2 unit_init_m4n1(%%zmm12,%%zmm13) unit_init_m4n1(%%zmm14,%%zmm15)
#define INIT_m4n6 INIT_m4n4 unit_init_m4n1(%%zmm16,%%zmm17) unit_init_m4n1(%%zmm18,%%zmm19)
#define INIT_m4n8 INIT_m4n6 unit_init_m4n1(%%zmm20,%%zmm21) unit_init_m4n1(%%zmm22,%%zmm23)
#define INIT_m4n10 INIT_m4n8 unit_init_m4n1(%%zmm24,%%zmm25) unit_init_m4n1(%%zmm26,%%zmm27)
#define INIT_m4n12 INIT_m4n10 unit_init_m4n1(%%zmm28,%%zmm29) unit_init_m4n1(%%zmm30,%%zmm31)

/* COMPUTE_m4(ndim): asm text for one m=4 tile over ndim columns
 * (ndim in 1,2,4,6,8,10,12).
 *  - Main loop (label <ndim>88449): entered only while %6 >= 20; each pass
 *    runs six k-steps, prefetches through %3 (stepping it by %4) and through
 *    %8 (stepping it by 24), then subtracts 6 from %6.
 *  - Tail loop (label <ndim>88440): one k-step per pass with a prefetcht0
 *    through %3, until %6 reaches zero.
 *  - Label <ndim>88441: SAVE_m4n<ndim>. */
#define COMPUTE_m4(ndim) \
    INIT_m4n##ndim \
    COMPUTE_INIT "movq %2,%3;"\
    "cmpq $20,%6; jb "#ndim"88440f;"\
    #ndim"88449:\n\t"\
    KERNEL_t_k1m4n##ndim \
    KERNEL_t_k1m4n##ndim \
    KERNEL_t_k1m4n##ndim \
    "prefetcht1 (%3); prefetcht1 63(%3); addq %4,%3;"\
    KERNEL_t_k1m4n##ndim \
    KERNEL_t_k1m4n##ndim \
    KERNEL_t_k1m4n##ndim \
    "prefetcht1 (%8); addq $24,%8;"\
    "subq $6,%6; cmpq $20,%6; jnb "#ndim"88449b;"\
    "movq %2,%3;"\
    #ndim"88440:\n\t"\
    "testq %6,%6; jz "#ndim"88441f;"\
    "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
    KERNEL_t_k1m4n##ndim \
    "decq %6; jmp "#ndim"88440b;"\
    #ndim"88441:\n\t"\
    SAVE_m4n##ndim
|
||||
|
||||
/* m=2, ymm0-ymm3 for temporary use, ymm4-ymm15 for accumulators */
|
||||
/* unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...): one accumulate step into c1.
   The variadic part is the memory operand (base, or base,index,scale) that b_off_r / b_off_i are
   offsets from. The macro broadcasts the double at b_off_i(...) into ymm2 and fuses it with ap
   into c1, then does the same with the double at b_off_r(...) and a0. ymm2 is overwritten.
   The two preprocessor branches differ only in the fused instruction: vfmaddsub231pd when
   ZGEMM_SKX_MODE is 0 or 3, vfmsubadd231pd otherwise. */
#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilpd($5,a0)
|
||||
#define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastsd "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#ap",%%ymm2,"#c1";"\
|
||||
"vbroadcastsd "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#a0",%%ymm2,"#c1";"
|
||||
#else //conjg_a != conjg_b
|
||||
#define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vbroadcastsd "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#ap",%%ymm2,"#c1";"\
|
||||
"vbroadcastsd "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#a0",%%ymm2,"#c1";"
|
||||
#endif
|
||||
/* KERNEL_h_k1m2n1: one k-step for m=2, n=1. Loads 32 bytes from (%0) into ymm0, puts the copy
   with the two doubles of each 128-bit half swapped (vpermilpd $5) into ymm1, advances %0 by 32,
   then accumulates into ymm4 from the doubles at 0(%1) and 8(%1). */
#define KERNEL_h_k1m2n1 \
|
||||
"vmovupd (%0),%%ymm0; vpermilpd $5,%%ymm0,%%ymm1; addq $32,%0;"\
|
||||
unit_kernel_k1m2n1(%%ymm0,%%ymm1,0,8,%%ymm4,%1)
|
||||
/* Each KERNEL_t_* macro below that has a KERNEL_h_* counterpart is that body followed by the
   advance of the pointer it reads from (%1 for n <= 4, r15 for n >= 8). There is no
   KERNEL_h_k1m2n6: KERNEL_t_k1m2n6 is built directly from KERNEL_h_k1m2n4. */
#define KERNEL_t_k1m2n1 KERNEL_h_k1m2n1 "addq $16,%1;"
|
||||
#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1 unit_kernel_k1m2n1(%%ymm0,%%ymm1,16,24,%%ymm5,%1)
|
||||
#define KERNEL_t_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;"
|
||||
/* unit_kernel_k1m2n2(c1,c2,...): two unit_kernel_k1m2n1 steps on the same memory operand, using
   offsets 0/8 for c1 and 16/24 for c2, always with ymm0/ymm1 as the A inputs. */
#define unit_kernel_k1m2n2(c1,c2,...) \
|
||||
unit_kernel_k1m2n1(%%ymm0,%%ymm1,0,8,c1,__VA_ARGS__)\
|
||||
unit_kernel_k1m2n1(%%ymm0,%%ymm1,16,24,c2,__VA_ARGS__)
|
||||
/* Columns beyond the second are addressed as (%1,%%r12,1) and (%1,%%r12,2).
   NOTE(review): r12 and r15 are loaded outside this excerpt; their contents are not shown here. */
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_kernel_k1m2n2(%%ymm6,%%ymm7,%1,%%r12,1)
|
||||
#define KERNEL_t_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
|
||||
#define KERNEL_t_k1m2n6 KERNEL_h_k1m2n4 unit_kernel_k1m2n2(%%ymm8,%%ymm9,%1,%%r12,2) "addq $32,%1;"
|
||||
/* From n=8 upward the chain starts from KERNEL_t_k1m2n6 (so %1 has already been advanced) and the
   extra columns are read through r15, optionally indexed by r12 with scale 1 or 2. */
#define KERNEL_h_k1m2n8 KERNEL_t_k1m2n6 unit_kernel_k1m2n2(%%ymm10,%%ymm11,%%r15)
|
||||
#define KERNEL_t_k1m2n8 KERNEL_h_k1m2n8 "addq $32,%%r15;"
|
||||
#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_kernel_k1m2n2(%%ymm12,%%ymm13,%%r15,%%r12,1)
|
||||
#define KERNEL_t_k1m2n10 KERNEL_h_k1m2n10 "addq $32,%%r15;"
|
||||
#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_kernel_k1m2n2(%%ymm14,%%ymm15,%%r15,%%r12,2)
|
||||
#define KERNEL_t_k1m2n12 KERNEL_h_k1m2n12 "addq $32,%%r15;"
|
||||
/* unit_save_m2n1(alp_r,alp_i,c1,...): combine accumulator c1 with the 32 bytes at the memory
   operand given by the variadic arguments and store the result back to that same location.
   Both branches first place the pair-swapped copy of c1 in ymm3 (vpermilpd $5), then issue two
   fused multiply instructions involving alp_r, alp_i, c1, ymm3 and the memory operand; the final
   value is left in c1 and written with vmovupd. ymm3 is overwritten.
   The branch for ZGEMM_SKX_MODE 0 or 2 uses vfmaddsub213pd twice; the other branch uses
   vfmsubadd213pd followed by vfmsubadd231pd. */
#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block
|
||||
#define unit_save_m2n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilpd $5,"#c1",%%ymm3; vfmaddsub213pd ("#__VA_ARGS__"),"#alp_i",%%ymm3;"\
|
||||
"vfmaddsub213pd %%ymm3,"#alp_r","#c1";vmovupd "#c1",("#__VA_ARGS__");"
|
||||
#else //do conjugation on a_block
|
||||
#define unit_save_m2n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilpd $5,"#c1",%%ymm3; vfmsubadd213pd ("#__VA_ARGS__"),"#alp_r","#c1";"\
|
||||
"vfmsubadd231pd %%ymm3,"#alp_i","#c1";vmovupd "#c1",("#__VA_ARGS__");"
|
||||
#endif
|
||||
/* SAVE_SETUP_m2: copy %2 into %3, advance %2 by 32 bytes, and broadcast the doubles at (%5) and
   8(%5) into ymm0 and ymm1. In COMPUTE, %5 is constval, which CNAME points at
   {alphar, alphai, -1, 1}, so ymm0/ymm1 receive alphar/alphai. */
#define SAVE_SETUP_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%5),%%ymm0; vbroadcastsd 8(%5),%%ymm1;"
|
||||
/* SAVE_m2n1 / SAVE_m2n2: store ymm4 at (%3) and ymm5 at (%3,%4,1). */
#define SAVE_m2n1 SAVE_SETUP_m2 unit_save_m2n1(%%ymm0,%%ymm1,%%ymm4,%3)
|
||||
#define SAVE_m2n2 SAVE_m2n1 unit_save_m2n1(%%ymm0,%%ymm1,%%ymm5,%3,%4,1)
|
||||
/* unit_save_m2n2(c1,c2): advance %3 by 2*%4, then store c1 at (%3) and c2 at (%3,%4,1). */
#define unit_save_m2n2(c1,c2) \
|
||||
"leaq (%3,%4,2),%3;" unit_save_m2n1(%%ymm0,%%ymm1,c1,%3) unit_save_m2n1(%%ymm0,%%ymm1,c2,%3,%4,1)
|
||||
/* SAVE_m2n4 ... SAVE_m2n12: the previous save sequence plus one more pair of accumulators. */
#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(%%ymm6,%%ymm7)
|
||||
#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(%%ymm8,%%ymm9)
|
||||
#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(%%ymm10,%%ymm11)
|
||||
#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(%%ymm12,%%ymm13)
|
||||
#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(%%ymm14,%%ymm15)
|
||||
/* INIT_m2n1: zero ymm4 (vpxor of the register with itself). */
#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
|
||||
/* unit_init_m2n2(c1,c2): zero the two given registers. */
#define unit_init_m2n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";"
|
||||
/* INIT_m2n2 ... INIT_m2n12: cumulative clearing, two more registers per step, from ymm4-ymm5
   (n=2) up to ymm4-ymm15 (n=12). */
#define INIT_m2n2 unit_init_m2n2(%%ymm4,%%ymm5)
|
||||
#define INIT_m2n4 INIT_m2n2 unit_init_m2n2(%%ymm6,%%ymm7)
|
||||
#define INIT_m2n6 INIT_m2n4 unit_init_m2n2(%%ymm8,%%ymm9)
|
||||
#define INIT_m2n8 INIT_m2n6 unit_init_m2n2(%%ymm10,%%ymm11)
|
||||
#define INIT_m2n10 INIT_m2n8 unit_init_m2n2(%%ymm12,%%ymm13)
|
||||
#define INIT_m2n12 INIT_m2n10 unit_init_m2n2(%%ymm14,%%ymm15)
|
||||
/* COMPUTE_m2(ndim): asm text for one m=2 pass over ndim columns.
   Clears the accumulators (INIT_m2n##ndim), runs COMPUTE_INIT, then loops at label <ndim>88220:
   exit to <ndim>88221 when %6 is zero, otherwise run one KERNEL_t_k1m2n##ndim step, decrement %6
   and jump back. SAVE_m2n##ndim follows the exit label.
   Unlike COMPUTE_m4 there is no unrolled main loop and no prefetching here. */
#define COMPUTE_m2(ndim) \
|
||||
INIT_m2n##ndim\
|
||||
COMPUTE_INIT\
|
||||
#ndim"88220:\n\t"\
|
||||
"testq %6,%6; jz "#ndim"88221f;"\
|
||||
KERNEL_t_k1m2n##ndim\
|
||||
"decq %6; jmp "#ndim"88220b;"\
|
||||
#ndim"88221:\n\t"\
|
||||
SAVE_m2n##ndim
|
||||
|
||||
/* m=1, ymm0-ymm3 and ymm10-ymm15 for temporary use, ymm4-ymm9 for accumulators */
|
||||
/* unit_kernel_k1m1n1 / unit_kernel_k1m1n2 (a0,ap,b_off_r,b_off_i,c1,...): one accumulate step
   into c1 for m=1. The variadic part is the memory operand the offsets apply to.
   Both load with vmovddup at b_off_i(...) and fuse with ap into c1, then load at b_off_r(...)
   and fuse with a0 into c1. The n1 form uses xmm2 as scratch, the n2 form uses ymm2; in the ymm
   form vmovddup duplicates the even-indexed doubles of a 32-byte load, so a single load supplies
   the values at the given offset and at offset+16.
   The two preprocessor branches differ only in the fused instruction (vfmaddsub231pd for
   ZGEMM_SKX_MODE 0 or 3, vfmsubadd231pd otherwise). */
#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilpd($5,a0)
|
||||
#define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vmovddup "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231pd "#ap",%%xmm2,"#c1";"\
|
||||
"vmovddup "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231pd "#a0",%%xmm2,"#c1";"
|
||||
#define unit_kernel_k1m1n2(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vmovddup "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#ap",%%ymm2,"#c1";"\
|
||||
"vmovddup "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#a0",%%ymm2,"#c1";"
|
||||
#else //conjg_a != conjg_b
|
||||
#define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vmovddup "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231pd "#ap",%%xmm2,"#c1";"\
|
||||
"vmovddup "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231pd "#a0",%%xmm2,"#c1";"
|
||||
#define unit_kernel_k1m1n2(a0,ap,b_off_r,b_off_i,c1,...) \
|
||||
"vmovddup "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#ap",%%ymm2,"#c1";"\
|
||||
"vmovddup "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#a0",%%ymm2,"#c1";"
|
||||
#endif
|
||||
/* KERNEL_h_k1m1n1: one k-step for m=1, n=1, entirely in xmm registers. Loads 16 bytes from (%0)
   into xmm0, puts the swapped pair in xmm1, advances %0 by 16 and accumulates into xmm4 from
   0(%1) and 8(%1). */
#define KERNEL_h_k1m1n1 \
|
||||
"vmovupd (%0),%%xmm0; vpermilpd $5,%%xmm0,%%xmm1; addq $16,%0;"\
|
||||
unit_kernel_k1m1n1(%%xmm0,%%xmm1,0,8,%%xmm4,%1)
|
||||
#define KERNEL_t_k1m1n1 KERNEL_h_k1m1n1 "addq $16,%1;"
|
||||
/* KERNEL_h_k1m1n2: one k-step for m=1, n=2. The 16 bytes at (%0) are replicated into both halves
   of ymm0 (vbroadcastf128), ymm1 gets the pair-swapped copy, %0 advances by 16, and ymm4
   accumulates two columns at once via unit_kernel_k1m1n2. */
#define KERNEL_h_k1m1n2 \
|
||||
"vbroadcastf128 (%0),%%ymm0; vpermilpd $5,%%ymm0,%%ymm1; addq $16,%0;"\
|
||||
unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm4,%1)
|
||||
#define KERNEL_t_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;"
|
||||
/* Wider variants add one ymm accumulator (ymm5..ymm9) per extra column pair, addressed as
   (%1,%%r12,1), (%1,%%r12,2), (%%r15), (%%r15,%%r12,1), (%%r15,%%r12,2). As in the m=2 chain,
   there is no KERNEL_h_k1m1n6, and the n >= 8 macros build on KERNEL_t_k1m1n6, which has already
   advanced %1; their KERNEL_t_* forms advance r15 instead.
   NOTE(review): r12 and r15 are loaded outside this excerpt; their contents are not shown here. */
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm5,%1,%%r12,1)
|
||||
#define KERNEL_t_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
|
||||
#define KERNEL_t_k1m1n6 KERNEL_h_k1m1n4 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm6,%1,%%r12,2) "addq $32,%1;"
|
||||
#define KERNEL_h_k1m1n8 KERNEL_t_k1m1n6 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm7,%%r15)
|
||||
#define KERNEL_t_k1m1n8 KERNEL_h_k1m1n8 "addq $32,%%r15;"
|
||||
#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm8,%%r15,%%r12,1)
|
||||
#define KERNEL_t_k1m1n10 KERNEL_h_k1m1n10 "addq $32,%%r15;"
|
||||
#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm9,%%r15,%%r12,2)
|
||||
#define KERNEL_t_k1m1n12 KERNEL_h_k1m1n12 "addq $32,%%r15;"
|
||||
/* unit_save_m1n1(alp_r,alp_i,c1,...): xmm-width save of one accumulator. Combines c1 with the
   16 bytes at the memory operand given by the variadic arguments and stores the result (left in
   c1) back to that location; xmm3 is used as scratch.
   unit_save_m1n2(alp_r,alp_i,c1): save of one ymm accumulator holding two columns. The 16 bytes
   at (%3) are loaded into the low half of ymm2 and the 16 bytes at (%3,%4,1) into the high half;
   after the two fused multiply instructions the result is in ymm3, whose low half is stored to
   (%3) and high half to (%3,%4,1); finally %3 is advanced by 2*%4. ymm2 and ymm3 are overwritten.
   The two preprocessor branches use the vfmaddsub (ZGEMM_SKX_MODE 0 or 2) or vfmsubadd
   (otherwise) instruction family. */
#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block
|
||||
#define unit_save_m1n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilpd $5,"#c1",%%xmm3; vfmaddsub213pd ("#__VA_ARGS__"),"#alp_i",%%xmm3;"\
|
||||
"vfmaddsub213pd %%xmm3,"#alp_r","#c1";vmovupd "#c1",("#__VA_ARGS__");"
|
||||
#define unit_save_m1n2(alp_r,alp_i,c1) \
|
||||
"vpermilpd $5,"#c1",%%ymm3; vmovupd (%3),%%xmm2; vinsertf128 $1,(%3,%4,1),%%ymm2,%%ymm2;"\
|
||||
"vfmaddsub213pd %%ymm2,"#alp_i",%%ymm3; vfmaddsub231pd "#c1","#alp_r",%%ymm3;"\
|
||||
"vmovupd %%xmm3,(%3); vextractf128 $1,%%ymm3,(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||
#else //do conjugation on a_block
|
||||
#define unit_save_m1n1(alp_r,alp_i,c1,...) \
|
||||
"vpermilpd $5,"#c1",%%xmm3; vfmsubadd213pd ("#__VA_ARGS__"),"#alp_r","#c1";"\
|
||||
"vfmsubadd231pd %%xmm3,"#alp_i","#c1";vmovupd "#c1",("#__VA_ARGS__");"
|
||||
#define unit_save_m1n2(alp_r,alp_i,c1) \
|
||||
"vpermilpd $5,"#c1",%%ymm3; vmovupd (%3),%%xmm2; vinsertf128 $1,(%3,%4,1),%%ymm2,%%ymm2;"\
|
||||
"vfmsubadd213pd %%ymm2,"#alp_r","#c1"; vfmsubadd213pd "#c1","#alp_i",%%ymm3;"\
|
||||
"vmovupd %%xmm3,(%3); vextractf128 $1,%%ymm3,(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||
#endif
|
||||
/* SAVE_SETUP_m1: copy %2 into %3, advance %2 by 16 bytes, and broadcast the doubles at (%5) and
   8(%5) into ymm0 and ymm1 (alphar and alphai, given that CNAME points constval at
   {alphar, alphai, -1, 1}). */
#define SAVE_SETUP_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%5),%%ymm0; vbroadcastsd 8(%5),%%ymm1;"
|
||||
/* SAVE_m1n1 stores the single xmm4 accumulator at (%3), using the xmm views of the broadcast
   alpha registers. */
#define SAVE_m1n1 SAVE_SETUP_m1 unit_save_m1n1(%%xmm0,%%xmm1,%%xmm4,%3)
|
||||
/* SAVE_m1n2 ... SAVE_m1n12: cumulative chain; each step appends one unit_save_m1n2 call for the
   next accumulator (ymm4 ... ymm9). unit_save_m1n2 itself moves %3 on to the next column pair. */
#define SAVE_m1n2 SAVE_SETUP_m1 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm4)
|
||||
#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm5)
|
||||
#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm6)
|
||||
#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm7)
|
||||
#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm8)
|
||||
#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm9)
|
||||
/* INIT_m1n1: zero xmm4. */
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
|
||||
/* INIT_m1n2 reuses INIT_m2n1, which zeroes ymm4. Each wider variant zeroes one more register,
   ending with ymm4-ymm9 for INIT_m1n12. */
#define INIT_m1n2 INIT_m2n1
|
||||
#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm5,%%ymm5,%%ymm5;"
|
||||
#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm6,%%ymm6,%%ymm6;"
|
||||
#define INIT_m1n8 INIT_m1n6 "vpxor %%ymm7,%%ymm7,%%ymm7;"
|
||||
#define INIT_m1n10 INIT_m1n8 "vpxor %%ymm8,%%ymm8,%%ymm8;"
|
||||
#define INIT_m1n12 INIT_m1n10 "vpxor %%ymm9,%%ymm9,%%ymm9;"
|
||||
/* COMPUTE_m1(ndim): asm text for one m=1 pass over ndim columns.
   Clears the accumulators (INIT_m1n##ndim), runs COMPUTE_INIT, then loops at label <ndim>88110:
   exit to <ndim>88111 when %6 is zero, otherwise run one KERNEL_t_k1m1n##ndim step, decrement %6
   and jump back. SAVE_m1n##ndim follows the exit label. */
#define COMPUTE_m1(ndim) \
|
||||
INIT_m1n##ndim\
|
||||
COMPUTE_INIT\
|
||||
#ndim"88110:\n\t"\
|
||||
"testq %6,%6; jz "#ndim"88111f;"\
|
||||
KERNEL_t_k1m1n##ndim\
|
||||
"decq %6; jmp "#ndim"88110b;"\
|
||||
#ndim"88111:\n\t"\
|
||||
SAVE_m1n##ndim
|
||||
|
||||
/* COMPUTE(ndim): process one panel of ndim columns with a single asm statement.
   Uses the caller's locals a_pointer, b_pointer, c_pointer, c_store, ldc_in_bytes, constval, K, M,
   b_pref and LDC. Asm operands, all read-write ("+r"):
     %0 a_pointer  %1 b_pointer  %2 c_pointer  %3 c_store  %4 ldc_in_bytes
     %5 constval   %6 K          %7 M          %8 b_pref
   Before the asm, b_pref is set to b_pointer + ndim*K*2.
   Inside the asm: GENERAL_INIT and CONSTZMM_INIT run first; COMPUTE_m4 is then repeated while
   %7 >= 4 (subtracting 4 each time); COMPUTE_m2 runs once if %7 >= 2 (subtracting 2); COMPUTE_m1
   runs once if %7 is still non-zero; GENERAL_RECOVER runs last.
   After the asm, a_pointer is moved back by M*K*2 doubles, b_pointer forward by ndim*K*2 and
   c_pointer by (LDC*ndim - M)*2.
   NOTE(review): the asm modifies %6 and %7 in place, while the pointer arithmetic after it uses
   K and M; presumably GENERAL_RECOVER (not visible here) restores them - confirm.
   The clobber list names r11-r15, zmm0-zmm31, "cc" and "memory". */
#define COMPUTE(ndim) {\
|
||||
b_pref = b_pointer + ndim * K * 2;\
|
||||
__asm__ __volatile__(\
|
||||
GENERAL_INIT\
|
||||
CONSTZMM_INIT\
|
||||
"cmpq $4,%7;jb 33101"#ndim"f;"\
|
||||
"33109"#ndim":\n\t"\
|
||||
COMPUTE_m4(ndim)\
|
||||
"subq $4,%7;cmpq $4,%7;jnb 33109"#ndim"b;"\
|
||||
"33101"#ndim":\n\t"\
|
||||
"cmpq $2,%7;jb 33102"#ndim"f;"\
|
||||
COMPUTE_m2(ndim)\
|
||||
"subq $2,%7;"\
|
||||
"33102"#ndim":\n\t"\
|
||||
"testq %7,%7;jz 33103"#ndim"f;"\
|
||||
COMPUTE_m1(ndim)\
|
||||
"33103"#ndim":\n\t"\
|
||||
GENERAL_RECOVER\
|
||||
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(c_store),"+r"(ldc_in_bytes),"+r"(constval),"+r"(K),"+r"(M),"+r"(b_pref)\
|
||||
::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
|
||||
"zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
|
||||
"cc","memory");\
|
||||
a_pointer -= M * K * 2; b_pointer += ndim * K * 2; c_pointer += (LDC * ndim - M) * 2;\
|
||||
}
|
||||
|
||||
/* Kernel entry point; the function name is supplied by the CNAME macro.
   Parameters:
     m, n, k        - problem dimensions; if any of them is zero the function returns immediately.
     alphar, alphai - stored as the first two entries of const_val, which the asm reads via constval.
     A, B, C        - restrict-qualified double pointers used as the initial a_pointer, b_pointer
                      and c_pointer / c_store.
     LDC            - used to compute ldc_in_bytes = LDC * sizeof(double) * 2 and in COMPUTE's
                      c_pointer update.
   The n columns are consumed in panels: as many panels of 12 as fit, then at most one panel each
   of 10, 8, 6, 4 and 2 columns, then a final single column if one remains.
   Returns 0 in all cases. */
int __attribute__ ((noinline))
|
||||
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
|
||||
{
|
||||
/* Nothing to do for an empty dimension. */
if(m==0||n==0||k==0) return 0;
|
||||
/* const_val = {alphar, alphai, -1, 1}; the asm macros read it through constval. */
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; double const_val[4] = {alphar, alphai, -1, 1};
|
||||
int64_t M = (int64_t)m, K = (int64_t)k;
|
||||
/* n_count is the number of columns still to be processed. */
BLASLONG n_count = n;
|
||||
double *a_pointer = A,*b_pointer = B,*c_pointer = C,*c_store = C,*constval = const_val,*b_pref = B;
|
||||
/* Widest panels first; every loop after the first can run at most once. */
for(;n_count>11;n_count-=12) COMPUTE(12)
|
||||
for(;n_count>9;n_count-=10) COMPUTE(10)
|
||||
for(;n_count>7;n_count-=8) COMPUTE(8)
|
||||
for(;n_count>5;n_count-=6) COMPUTE(6)
|
||||
for(;n_count>3;n_count-=4) COMPUTE(4)
|
||||
for(;n_count>1;n_count-=2) COMPUTE(2)
|
||||
if(n_count>0) COMPUTE(1)
|
||||
return 0;
|
||||
}
|
||||
@@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vrepf %%v25,%%v24,2\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vstef %%v24,%[asum],0"
|
||||
"vstef %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
||||
@@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
"vfadb %%v24,%%v24,%%v31\n\t"
|
||||
"vrepg %%v25,%%v24,1\n\t"
|
||||
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||
"vsteg %%v24,%[asum],0"
|
||||
"vsteg %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
||||
@@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vrepf %%v25,%%v24,2\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vstef %%v24,%[asum],0"
|
||||
"vstef %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
||||
@@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
|
||||
"vfadb %%v24,%%v24,%%v31\n\t"
|
||||
"vrepg %%v25,%%v24,1\n\t"
|
||||
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||
"vsteg %%v24,%[asum],0"
|
||||
"vsteg %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
||||
79
param.h
79
param.h
@@ -1696,11 +1696,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ZGEMM_DEFAULT_P 256
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
#define SGEMM_DEFAULT_Q 320
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#else
|
||||
#define SGEMM_DEFAULT_Q 384
|
||||
#define DGEMM_DEFAULT_Q 256
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#endif
|
||||
#define CGEMM_DEFAULT_Q 192
|
||||
#define ZGEMM_DEFAULT_Q 128
|
||||
@@ -1990,11 +1990,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define GEMM_DEFAULT_OFFSET_B 3072
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#else
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#endif
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#else
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#endif
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
@@ -2588,38 +2596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define SYMV_P 16
|
||||
|
||||
// Darwin / Cross
|
||||
#if defined(OS_DARWIN) && defined(CROSS)
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 128
|
||||
#define CGEMM_DEFAULT_P 96
|
||||
#define ZGEMM_DEFAULT_P 64
|
||||
|
||||
#define SGEMM_DEFAULT_Q 240
|
||||
#define DGEMM_DEFAULT_Q 120
|
||||
#define CGEMM_DEFAULT_Q 120
|
||||
#define ZGEMM_DEFAULT_Q 120
|
||||
|
||||
#define SGEMM_DEFAULT_R 12288
|
||||
#define DGEMM_DEFAULT_R 8192
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#else // Linux / Native
|
||||
|
||||
#if defined(CORTEXA53) || defined(CORTEXA57) || \
|
||||
defined(CORTEXA72) || defined(CORTEXA73) || \
|
||||
defined(FALKOR) || defined(TSV110)
|
||||
@@ -2636,15 +2612,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 512
|
||||
#define DGEMM_DEFAULT_P 256
|
||||
#define CGEMM_DEFAULT_P 256
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
/*FIXME: this should be using the cache size, but there is currently no easy way to
|
||||
query that on ARM. So if getarch counted more than 8 cores we simply assume the host
|
||||
is a big desktop or server with abundant cache rather than a phone or embedded device */
|
||||
#if NUM_CORES > 8
|
||||
#define SGEMM_DEFAULT_P 512
|
||||
#define DGEMM_DEFAULT_P 256
|
||||
#define CGEMM_DEFAULT_P 256
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 1024
|
||||
#define DGEMM_DEFAULT_Q 512
|
||||
#define CGEMM_DEFAULT_Q 512
|
||||
#define ZGEMM_DEFAULT_Q 512
|
||||
#define SGEMM_DEFAULT_Q 1024
|
||||
#define DGEMM_DEFAULT_Q 512
|
||||
#define CGEMM_DEFAULT_Q 512
|
||||
#define ZGEMM_DEFAULT_Q 512
|
||||
#else
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 160
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 352
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 224
|
||||
#define ZGEMM_DEFAULT_Q 112
|
||||
#endif
|
||||
|
||||
#define SGEMM_DEFAULT_R 4096
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
@@ -2740,8 +2731,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif // Cores
|
||||
|
||||
#endif // Linux / Darwin
|
||||
|
||||
#endif // ARMv8
|
||||
|
||||
#if defined(ARMV5)
|
||||
|
||||
Reference in New Issue
Block a user