commit
0257f26488
|
@ -171,3 +171,11 @@ In chronological order:
|
||||||
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||||
* [2019-03-14] power9 dgemm/dtrmm kernel
|
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||||
* [2019-04-29] power9 sgemm/strmm kernel
|
* [2019-04-29] power9 sgemm/strmm kernel
|
||||||
|
|
||||||
|
* Jiachen Wang <https://github.com/wjc404>
|
||||||
|
* [2019-07-29] optimize AVX2 DGEMM
|
||||||
|
* [2019-10-20] AVX512 DGEMM kernel (4x8)
|
||||||
|
* [2019-11-06] optimize AVX512 SGEMM
|
||||||
|
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
|
||||||
|
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||||
|
* [2019-12-27] AVX2 CGEMM3M kernel
|
||||||
|
|
|
@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||||
ifeq ($(CORE), TSV110)
|
ifeq ($(CORE), TSV110)
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
|
@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC)
|
||||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||||
|
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||||
ifeq ($(GCCVERSIONGT4), 1)
|
ifeq ($(GCCVERSIONGT4), 1)
|
||||||
# GCC Major version > 4
|
# GCC Major version > 4
|
||||||
|
@ -547,9 +548,14 @@ endif
|
||||||
|
|
||||||
ifeq ($(ARCH), arm64)
|
ifeq ($(ARCH), arm64)
|
||||||
DYNAMIC_CORE = ARMV8
|
DYNAMIC_CORE = ARMV8
|
||||||
|
DYNAMIC_CORE += CORTEXA53
|
||||||
DYNAMIC_CORE += CORTEXA57
|
DYNAMIC_CORE += CORTEXA57
|
||||||
|
DYNAMIC_CORE += CORTEXA72
|
||||||
|
DYNAMIC_CORE += CORTEXA73
|
||||||
|
DYNAMIC_CORE += FALKOR
|
||||||
DYNAMIC_CORE += THUNDERX
|
DYNAMIC_CORE += THUNDERX
|
||||||
DYNAMIC_CORE += THUNDERX2T99
|
DYNAMIC_CORE += THUNDERX2T99
|
||||||
|
DYNAMIC_CORE += TSV110
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), power)
|
ifeq ($(ARCH), power)
|
||||||
|
|
|
@ -45,7 +45,11 @@ endif ()
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
if (ARM64)
|
if (ARM64)
|
||||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
|
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (POWER)
|
||||||
|
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (X86)
|
if (X86)
|
||||||
|
|
|
@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||||
set(ZGEMM_UNROLL_M 4)
|
set(ZGEMM_UNROLL_M 4)
|
||||||
set(ZGEMM_UNROLL_N 4)
|
set(ZGEMM_UNROLL_N 4)
|
||||||
set(SYMV_P 16)
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define ARMV8\n"
|
||||||
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L2_SIZE\t524288\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
"#define L2_SIZE 524288\n"
|
||||||
|
"#define L2_LINESIZE 128 \n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||||
|
"#define DTB_SIZE 4096\n"
|
||||||
|
"#define L2_ASSOCIATIVE 8\n")
|
||||||
|
set(SGEMM_UNROLL_M 4)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 4)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 2)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 2)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 8)
|
||||||
|
elseif ("${TCORE}" STREQUAL "POWER8")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
"#define L2_SIZE 524288\n"
|
||||||
|
"#define L2_LINESIZE 128 \n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||||
|
"#define DTB_SIZE 4096\n"
|
||||||
|
"#define L2_ASSOCIATIVE 8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 8)
|
||||||
|
set(DGEMM_UNROLL_M 16)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 8)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
set(SYMV_P 8)
|
||||||
|
elseif ("${TCORE}" STREQUAL "POWER9")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
"#define L2_SIZE 524288\n"
|
||||||
|
"#define L2_LINESIZE 128 \n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||||
|
"#define DTB_SIZE 4096\n"
|
||||||
|
"#define L2_ASSOCIATIVE 8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 8)
|
||||||
|
set(DGEMM_UNROLL_M 16)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 8)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
set(SYMV_P 8)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Or should this actually be NUM_CORES?
|
# Or should this actually be NUM_CORES?
|
||||||
|
|
|
@ -39,6 +39,35 @@
|
||||||
#ifndef COMMON_POWER
|
#ifndef COMMON_POWER
|
||||||
#define COMMON_POWER
|
#define COMMON_POWER
|
||||||
|
|
||||||
|
#define str(x) #x
|
||||||
|
|
||||||
|
#ifdef OS_AIX
|
||||||
|
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
|
||||||
|
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
|
||||||
|
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
|
||||||
|
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
|
||||||
|
#define XVMOVDP(T,A) xvcpsgndp T, A, A
|
||||||
|
|
||||||
|
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
|
||||||
|
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
|
||||||
|
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
|
||||||
|
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define XXSPLTD(T,A,z) xxspltd T, A, z
|
||||||
|
#define XXMRGHD(T,A,B) xxmrghd T, A, B
|
||||||
|
#define XXMRGLD(T,A,B) xxmrgld T, A, B
|
||||||
|
#define XXSWAPD(T,A) xxswapd T, A
|
||||||
|
#define XVMOVDP(T,A) xvmovdp T, A
|
||||||
|
|
||||||
|
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
|
||||||
|
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||||
|
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||||
|
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9)
|
#if defined(POWER8) || defined(POWER9)
|
||||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
|
|
|
@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
|
|
@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
|
|
||||||
for(i = 0; i < blas_num_threads - 1; i++){
|
for(i = 0; i < blas_num_threads - 1; i++){
|
||||||
// Could also just use WaitForMultipleObjects
|
// Could also just use WaitForMultipleObjects
|
||||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
|
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||||
|
|
||||||
#ifndef OS_WINDOWSSTORE
|
#ifndef OS_WINDOWSSTORE
|
||||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||||
|
|
|
@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
case 7:
|
case 7:
|
||||||
|
if (model == 10) // Goldmont Plus
|
||||||
|
return &gotoblas_NEHALEM;
|
||||||
if (model == 14) {
|
if (model == 14) {
|
||||||
// Ice Lake
|
// Ice Lake
|
||||||
if (support_avx512())
|
if (support_avx512())
|
||||||
|
|
|
@ -43,13 +43,18 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern gotoblas_t gotoblas_ARMV8;
|
extern gotoblas_t gotoblas_ARMV8;
|
||||||
|
extern gotoblas_t gotoblas_CORTEXA53;
|
||||||
extern gotoblas_t gotoblas_CORTEXA57;
|
extern gotoblas_t gotoblas_CORTEXA57;
|
||||||
|
extern gotoblas_t gotoblas_CORTEXA72;
|
||||||
|
extern gotoblas_t gotoblas_CORTEXA73;
|
||||||
|
extern gotoblas_t gotoblas_FALKOR;
|
||||||
extern gotoblas_t gotoblas_THUNDERX;
|
extern gotoblas_t gotoblas_THUNDERX;
|
||||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||||
|
extern gotoblas_t gotoblas_TSV110;
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
|
|
||||||
#define NUM_CORETYPES 4
|
#define NUM_CORETYPES 9
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||||
|
@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
"armv8",
|
"armv8",
|
||||||
|
"cortexa53",
|
||||||
"cortexa57",
|
"cortexa57",
|
||||||
|
"cortexa72",
|
||||||
|
"cortexa73",
|
||||||
|
"falkor",
|
||||||
"thunderx",
|
"thunderx",
|
||||||
"thunderx2t99",
|
"thunderx2t99",
|
||||||
|
"tsv110",
|
||||||
"unknown"
|
"unknown"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
|
||||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
|
||||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
|
||||||
|
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
|
||||||
|
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
|
||||||
|
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
|
||||||
|
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
|
||||||
|
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||||
return corename[NUM_CORETYPES];
|
return corename[NUM_CORETYPES];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||||
switch (found)
|
switch (found)
|
||||||
{
|
{
|
||||||
case 0: return (&gotoblas_ARMV8);
|
case 0: return (&gotoblas_ARMV8);
|
||||||
case 1: return (&gotoblas_CORTEXA57);
|
case 1: return (&gotoblas_CORTEXA53);
|
||||||
case 2: return (&gotoblas_THUNDERX);
|
case 2: return (&gotoblas_CORTEXA57);
|
||||||
case 3: return (&gotoblas_THUNDERX2T99);
|
case 3: return (&gotoblas_CORTEXA72);
|
||||||
|
case 4: return (&gotoblas_CORTEXA73);
|
||||||
|
case 5: return (&gotoblas_FALKOR);
|
||||||
|
case 6: return (&gotoblas_THUNDERX);
|
||||||
|
case 7: return (&gotoblas_THUNDERX2T99);
|
||||||
|
case 8: return (&gotoblas_TSV110);
|
||||||
}
|
}
|
||||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||||
openblas_warning(1, message);
|
openblas_warning(1, message);
|
||||||
|
@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
|
||||||
case 0x41: // ARM
|
case 0x41: // ARM
|
||||||
switch (part)
|
switch (part)
|
||||||
{
|
{
|
||||||
case 0xd07: // Cortex A57
|
|
||||||
case 0xd08: // Cortex A72
|
|
||||||
case 0xd03: // Cortex A53
|
case 0xd03: // Cortex A53
|
||||||
|
return &gotoblas_CORTEXA53;
|
||||||
|
case 0xd07: // Cortex A57
|
||||||
return &gotoblas_CORTEXA57;
|
return &gotoblas_CORTEXA57;
|
||||||
|
case 0xd08: // Cortex A72
|
||||||
|
return &gotoblas_CORTEXA72;
|
||||||
|
case 0xd09: // Cortex A73
|
||||||
|
return &gotoblas_CORTEXA73;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 0x42: // Broadcom
|
case 0x42: // Broadcom
|
||||||
|
@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
|
||||||
return &gotoblas_THUNDERX2T99;
|
return &gotoblas_THUNDERX2T99;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 0x48: // HiSilicon
|
||||||
|
switch (part)
|
||||||
|
{
|
||||||
|
case 0xd01: // tsv110
|
||||||
|
return &gotoblas_TSV110;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 0x51: // Qualcomm
|
||||||
|
switch (part)
|
||||||
|
{
|
||||||
|
case 0xc00: // Falkor
|
||||||
|
return &gotoblas_FALKOR;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
897
dynamic.c
897
dynamic.c
|
@ -1,897 +0,0 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
||||||
/* All rights reserved. */
|
|
||||||
/* */
|
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
|
||||||
/* without modification, are permitted provided that the following */
|
|
||||||
/* conditions are met: */
|
|
||||||
/* */
|
|
||||||
/* 1. Redistributions of source code must retain the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer. */
|
|
||||||
/* */
|
|
||||||
/* 2. Redistributions in binary form must reproduce the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer in the documentation and/or other materials */
|
|
||||||
/* provided with the distribution. */
|
|
||||||
/* */
|
|
||||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
||||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
||||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
||||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
||||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
||||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
||||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
||||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
||||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
||||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
||||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
||||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
||||||
/* */
|
|
||||||
/* The views and conclusions contained in the software and */
|
|
||||||
/* documentation are those of the authors and should not be */
|
|
||||||
/* interpreted as representing official policies, either expressed */
|
|
||||||
/* or implied, of The University of Texas at Austin. */
|
|
||||||
/*********************************************************************/
|
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define strncasecmp _strnicmp
|
|
||||||
#define strcasecmp _stricmp
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
|
||||||
#define EXTERN extern
|
|
||||||
#else
|
|
||||||
#define EXTERN
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef DYNAMIC_LIST
|
|
||||||
extern gotoblas_t gotoblas_PRESCOTT;
|
|
||||||
|
|
||||||
#ifdef DYN_ATHLON
|
|
||||||
extern gotoblas_t gotoblas_ATHLON;
|
|
||||||
#else
|
|
||||||
#define gotoblas_ATHLON gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_KATMAI
|
|
||||||
extern gotoblas_t gotoblas_KATMAI;
|
|
||||||
#else
|
|
||||||
#define gotoblas_KATMAI gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_BANIAS
|
|
||||||
extern gotoblas_t gotoblas_BANIAS;
|
|
||||||
#else
|
|
||||||
#define gotoblas_BANIAS gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_COPPERMINE
|
|
||||||
extern gotoblas_t gotoblas_COPPERMINE;
|
|
||||||
#else
|
|
||||||
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_NORTHWOOD
|
|
||||||
extern gotoblas_t gotoblas_NORTHWOOD;
|
|
||||||
#else
|
|
||||||
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_CORE2
|
|
||||||
extern gotoblas_t gotoblas_CORE2;
|
|
||||||
#else
|
|
||||||
#define gotoblas_CORE2 gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_NEHALEM
|
|
||||||
extern gotoblas_t gotoblas_NEHALEM;
|
|
||||||
#else
|
|
||||||
#define gotoblas_NEHALEM gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_BARCELONA
|
|
||||||
extern gotoblas_t gotoblas_BARCELONA;
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_BARCELONA gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_BARCELONA gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_ATOM
|
|
||||||
extern gotoblas_t gotoblas_ATOM;
|
|
||||||
elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_ATOM gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_NANO
|
|
||||||
extern gotoblas_t gotoblas_NANO;
|
|
||||||
#else
|
|
||||||
#define gotoblas_NANO gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_PENRYN
|
|
||||||
extern gotoblas_t gotoblas_PENRYN;
|
|
||||||
#else
|
|
||||||
#define gotoblas_PENRYN gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_DUNNINGTON
|
|
||||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
|
||||||
#else
|
|
||||||
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_OPTERON
|
|
||||||
extern gotoblas_t gotoblas_OPTERON;
|
|
||||||
#else
|
|
||||||
#define gotoblas_OPTERON gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_OPTERON_SSE3
|
|
||||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
|
||||||
#else
|
|
||||||
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_BOBCAT
|
|
||||||
extern gotoblas_t gotoblas_BOBCAT;
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_BOBCAT gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_BOBCAT gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_SANDYBRIDGE
|
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_BULLDOZER
|
|
||||||
extern gotoblas_t gotoblas_BULLDOZER;
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_BULLDOZER gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_PILEDRIVER
|
|
||||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_STEAMROLLER
|
|
||||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_EXCAVATOR
|
|
||||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_HASWELL
|
|
||||||
extern gotoblas_t gotoblas_HASWELL;
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_HASWELL gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_ZEN
|
|
||||||
extern gotoblas_t gotoblas_ZEN;
|
|
||||||
#elif defined(DYN_HASWELL)
|
|
||||||
#define gotoblas_ZEN gotoblas_HASWELL
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_ZEN gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_ZEN gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
#ifdef DYN_SKYLAKEX
|
|
||||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
|
||||||
#elif defined(DYN_HASWELL)
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
|
||||||
#elif defined(DYN_SANDYBRIDGE)
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
|
||||||
#elif defined(DYN_NEHALEM)
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
|
||||||
#else
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
#else // not DYNAMIC_LIST
|
|
||||||
EXTERN gotoblas_t gotoblas_KATMAI;
|
|
||||||
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
|
||||||
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
|
||||||
EXTERN gotoblas_t gotoblas_BANIAS;
|
|
||||||
EXTERN gotoblas_t gotoblas_ATHLON;
|
|
||||||
|
|
||||||
extern gotoblas_t gotoblas_PRESCOTT;
|
|
||||||
extern gotoblas_t gotoblas_CORE2;
|
|
||||||
extern gotoblas_t gotoblas_NEHALEM;
|
|
||||||
extern gotoblas_t gotoblas_BARCELONA;
|
|
||||||
#ifdef DYNAMIC_OLDER
|
|
||||||
extern gotoblas_t gotoblas_ATOM;
|
|
||||||
extern gotoblas_t gotoblas_NANO;
|
|
||||||
extern gotoblas_t gotoblas_PENRYN;
|
|
||||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
|
||||||
extern gotoblas_t gotoblas_OPTERON;
|
|
||||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
|
||||||
extern gotoblas_t gotoblas_BOBCAT;
|
|
||||||
#else
|
|
||||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
|
||||||
#define gotoblas_NANO gotoblas_NEHALEM
|
|
||||||
#define gotoblas_PENRYN gotoblas_CORE2
|
|
||||||
#define gotoblas_DUNNINGTON gotoblas_CORE2
|
|
||||||
#define gotoblas_OPTERON gotoblas_CORE2
|
|
||||||
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
|
|
||||||
#define gotoblas_BOBCAT gotoblas_CORE2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef NO_AVX
|
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
|
||||||
extern gotoblas_t gotoblas_BULLDOZER;
|
|
||||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
|
||||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
|
||||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
|
||||||
#ifdef NO_AVX2
|
|
||||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
|
||||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
|
||||||
#else
|
|
||||||
extern gotoblas_t gotoblas_HASWELL;
|
|
||||||
extern gotoblas_t gotoblas_ZEN;
|
|
||||||
#ifndef NO_AVX512
|
|
||||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
|
||||||
#else
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
//Use NEHALEM kernels for sandy bridge
|
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
|
||||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
|
||||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
|
||||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
|
||||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
|
||||||
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
|
|
||||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // DYNAMIC_LIST
|
|
||||||
|
|
||||||
#define VENDOR_INTEL 1
|
|
||||||
#define VENDOR_AMD 2
|
|
||||||
#define VENDOR_CENTAUR 3
|
|
||||||
#define VENDOR_HYGON 4
|
|
||||||
#define VENDOR_UNKNOWN 99
|
|
||||||
|
|
||||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
|
||||||
|
|
||||||
#ifndef NO_AVX
|
|
||||||
static inline void xgetbv(int op, int * eax, int * edx){
|
|
||||||
//Use binary code for xgetbv
|
|
||||||
__asm__ __volatile__
|
|
||||||
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int support_avx(){
|
|
||||||
#ifndef NO_AVX
|
|
||||||
int eax, ebx, ecx, edx;
|
|
||||||
int ret=0;
|
|
||||||
|
|
||||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
|
||||||
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
|
|
||||||
xgetbv(0, &eax, &edx);
|
|
||||||
if((eax & 6) == 6){
|
|
||||||
ret=1; //OS support AVX
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
int support_avx2(){
|
|
||||||
#ifndef NO_AVX2
|
|
||||||
int eax, ebx, ecx=0, edx;
|
|
||||||
int ret=0;
|
|
||||||
|
|
||||||
if (!support_avx())
|
|
||||||
return 0;
|
|
||||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
|
||||||
if((ebx & (1<<7)) != 0)
|
|
||||||
ret=1; //OS supports AVX2
|
|
||||||
return ret;
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
int support_avx512(){
|
|
||||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
|
||||||
int eax, ebx, ecx, edx;
|
|
||||||
int ret=0;
|
|
||||||
|
|
||||||
if (!support_avx())
|
|
||||||
return 0;
|
|
||||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
|
||||||
if((ebx & (1<<7)) != 1){
|
|
||||||
ret=0; //OS does not even support AVX2
|
|
||||||
}
|
|
||||||
if((ebx & (1<<31)) != 0){
|
|
||||||
xgetbv(0, &eax, &edx);
|
|
||||||
if((eax & 0xe0) == 0xe0)
|
|
||||||
ret=1; //OS supports AVX512VL
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
|
||||||
#define FALLBACK_VERBOSE 1
|
|
||||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
|
||||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
|
|
||||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
|
|
||||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
|
|
||||||
|
|
||||||
static int get_vendor(void){
|
|
||||||
int eax, ebx, ecx, edx;
|
|
||||||
|
|
||||||
union
|
|
||||||
{
|
|
||||||
char vchar[16];
|
|
||||||
int vint[4];
|
|
||||||
} vendor;
|
|
||||||
|
|
||||||
cpuid(0, &eax, &ebx, &ecx, &edx);
|
|
||||||
|
|
||||||
*(&vendor.vint[0]) = ebx;
|
|
||||||
*(&vendor.vint[1]) = edx;
|
|
||||||
*(&vendor.vint[2]) = ecx;
|
|
||||||
|
|
||||||
vendor.vchar[12] = '\0';
|
|
||||||
|
|
||||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
|
||||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
|
||||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
|
||||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
|
||||||
|
|
||||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
|
||||||
|
|
||||||
return VENDOR_UNKNOWN;
|
|
||||||
}
|
|
||||||
|
|
||||||
static gotoblas_t *get_coretype(void){
|
|
||||||
|
|
||||||
int eax, ebx, ecx, edx;
|
|
||||||
int family, exfamily, model, vendor, exmodel;
|
|
||||||
|
|
||||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
|
||||||
|
|
||||||
family = BITMASK(eax, 8, 0x0f);
|
|
||||||
exfamily = BITMASK(eax, 20, 0xff);
|
|
||||||
model = BITMASK(eax, 4, 0x0f);
|
|
||||||
exmodel = BITMASK(eax, 16, 0x0f);
|
|
||||||
|
|
||||||
vendor = get_vendor();
|
|
||||||
|
|
||||||
if (vendor == VENDOR_INTEL){
|
|
||||||
switch (family) {
|
|
||||||
case 0x6:
|
|
||||||
switch (exmodel) {
|
|
||||||
case 0:
|
|
||||||
if (model <= 0x7) return &gotoblas_KATMAI;
|
|
||||||
if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
|
|
||||||
if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
|
|
||||||
if (model == 14) return &gotoblas_BANIAS;
|
|
||||||
if (model == 15) return &gotoblas_CORE2;
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
case 1:
|
|
||||||
if (model == 6) return &gotoblas_CORE2;
|
|
||||||
if (model == 7) return &gotoblas_PENRYN;
|
|
||||||
if (model == 13) return &gotoblas_DUNNINGTON;
|
|
||||||
if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
|
|
||||||
if (model == 12) return &gotoblas_ATOM;
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
case 2:
|
|
||||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
|
||||||
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
|
|
||||||
// Xeon (Clarkdale), 32nm
|
|
||||||
if (model == 5) return &gotoblas_NEHALEM;
|
|
||||||
|
|
||||||
//Intel Xeon Processor 5600 (Westmere-EP)
|
|
||||||
//Xeon Processor E7 (Westmere-EX)
|
|
||||||
//Xeon E7540
|
|
||||||
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
|
|
||||||
|
|
||||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
|
||||||
//Intel Core i7-3000 / Xeon E5
|
|
||||||
if (model == 10 || model == 13) {
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
case 3:
|
|
||||||
//Intel Sandy Bridge 22nm (Ivy Bridge?)
|
|
||||||
if (model == 10 || model == 14) {
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Haswell
|
|
||||||
if (model == 12 || model == 15) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Broadwell
|
|
||||||
if (model == 13) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (model == 7) return &gotoblas_ATOM; //Bay Trail
|
|
||||||
return NULL;
|
|
||||||
case 4:
|
|
||||||
//Intel Haswell
|
|
||||||
if (model == 5 || model == 6) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Broadwell
|
|
||||||
if (model == 7 || model == 15) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Skylake
|
|
||||||
if (model == 14) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Braswell / Avoton
|
|
||||||
if (model == 12 || model == 13) {
|
|
||||||
return &gotoblas_NEHALEM;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
case 5:
|
|
||||||
//Intel Broadwell
|
|
||||||
if (model == 6) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (model == 5) {
|
|
||||||
// Intel Skylake X
|
|
||||||
if (support_avx512())
|
|
||||||
return &gotoblas_SKYLAKEX;
|
|
||||||
if(support_avx2()){
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
}
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Skylake
|
|
||||||
if (model == 14) {
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Intel Phi Knights Landing
|
|
||||||
if (model == 7) {
|
|
||||||
if(support_avx2()){
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
}
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Apollo Lake or Denverton
|
|
||||||
if (model == 12 || model == 15) {
|
|
||||||
return &gotoblas_NEHALEM;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
case 6:
|
|
||||||
if (model == 6) {
|
|
||||||
// Cannon Lake
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
case 7:
|
|
||||||
if (model == 10) // Goldmont plus
|
|
||||||
return &gotoblas_NEHALEM;
|
|
||||||
if (model == 14) {
|
|
||||||
// Ice Lake
|
|
||||||
if (support_avx512())
|
|
||||||
return &gotoblas_SKYLAKEX;
|
|
||||||
if(support_avx2()){
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
}
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
case 9:
|
|
||||||
case 8:
|
|
||||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
|
||||||
if(support_avx2())
|
|
||||||
return &gotoblas_HASWELL;
|
|
||||||
if(support_avx()) {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
|
||||||
return &gotoblas_SANDYBRIDGE;
|
|
||||||
} else {
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
case 0xf:
|
|
||||||
if (model <= 0x2) return &gotoblas_NORTHWOOD;
|
|
||||||
return &gotoblas_PRESCOTT;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
|
|
||||||
if (family <= 0xe) {
|
|
||||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
|
|
||||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
|
||||||
if ( (eax & 0xffff) >= 0x01) {
|
|
||||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
|
||||||
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
return &gotoblas_ATHLON;
|
|
||||||
}
|
|
||||||
if (family == 0xf){
|
|
||||||
if ((exfamily == 0) || (exfamily == 2)) {
|
|
||||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
|
||||||
else return &gotoblas_OPTERON;
|
|
||||||
} else if (exfamily == 5) {
|
|
||||||
return &gotoblas_BOBCAT;
|
|
||||||
} else if (exfamily == 6) {
|
|
||||||
if(model == 1){
|
|
||||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_BULLDOZER;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}else if(model == 2 || model == 3){
|
|
||||||
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_PILEDRIVER;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}else if(model == 5){
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_EXCAVATOR;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}else if(model == 0 || model == 8){
|
|
||||||
if (exmodel == 1) {
|
|
||||||
//AMD Trinity
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_PILEDRIVER;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}else if (exmodel == 3) {
|
|
||||||
//AMD STEAMROLLER
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_STEAMROLLER;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}else if (exmodel == 6) {
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_EXCAVATOR;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (exfamily == 8) {
|
|
||||||
if (model == 1 || model == 8) {
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_ZEN;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (exfamily == 9) {
|
|
||||||
if(support_avx())
|
|
||||||
return &gotoblas_ZEN;
|
|
||||||
else{
|
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
|
||||||
}
|
|
||||||
}else {
|
|
||||||
return &gotoblas_BARCELONA;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (vendor == VENDOR_CENTAUR) {
|
|
||||||
switch (family) {
|
|
||||||
case 0x6:
|
|
||||||
return &gotoblas_NANO;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Human-readable core names, indexed by the core id used throughout this
   file (gotoblas_corename() and force_coretype() both rely on this order). */
static char *corename[] = {
  "Unknown",      /*  0 */
  "Katmai",       /*  1 */
  "Coppermine",   /*  2 */
  "Northwood",    /*  3 */
  "Prescott",     /*  4 */
  "Banias",       /*  5 */
  "Atom",         /*  6 */
  "Core2",        /*  7 */
  "Penryn",       /*  8 */
  "Dunnington",   /*  9 */
  "Nehalem",      /* 10 */
  "Athlon",       /* 11 */
  "Opteron",      /* 12 */
  "Opteron_SSE3", /* 13 */
  "Barcelona",    /* 14 */
  "Nano",         /* 15 */
  "Sandybridge",  /* 16 */
  "Bobcat",       /* 17 */
  "Bulldozer",    /* 18 */
  "Piledriver",   /* 19 */
  "Haswell",      /* 20 */
  "Steamroller",  /* 21 */
  "Excavator",    /* 22 */
  "Zen",          /* 23 */
  "SkylakeX"      /* 24 */
};
|
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
|
||||||
|
|
||||||
if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
|
|
||||||
if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
|
|
||||||
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
|
||||||
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
|
||||||
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
|
||||||
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
|
|
||||||
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
|
||||||
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
|
|
||||||
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
|
|
||||||
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
|
||||||
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
|
||||||
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
|
|
||||||
if (gotoblas == &gotoblas_OPTERON) return corename[13];
|
|
||||||
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
|
||||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
|
||||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
|
||||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
|
||||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
|
||||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
|
||||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
|
||||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
|
||||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
|
||||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
|
||||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
|
||||||
return corename[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static gotoblas_t *force_coretype(char *coretype){
|
|
||||||
|
|
||||||
int i ;
|
|
||||||
int found = -1;
|
|
||||||
char message[128];
|
|
||||||
//char mname[20];
|
|
||||||
|
|
||||||
for ( i=1 ; i <= 24; i++)
|
|
||||||
{
|
|
||||||
if (!strncasecmp(coretype,corename[i],20))
|
|
||||||
{
|
|
||||||
found = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (found < 0)
|
|
||||||
{
|
|
||||||
//strncpy(mname,coretype,20);
|
|
||||||
snprintf(message, 128, "Core not found: %s\n",coretype);
|
|
||||||
openblas_warning(1, message);
|
|
||||||
return(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (found)
|
|
||||||
{
|
|
||||||
case 24: return (&gotoblas_SKYLAKEX);
|
|
||||||
case 23: return (&gotoblas_ZEN);
|
|
||||||
case 22: return (&gotoblas_EXCAVATOR);
|
|
||||||
case 21: return (&gotoblas_STEAMROLLER);
|
|
||||||
case 20: return (&gotoblas_HASWELL);
|
|
||||||
case 19: return (&gotoblas_PILEDRIVER);
|
|
||||||
case 18: return (&gotoblas_BULLDOZER);
|
|
||||||
case 17: return (&gotoblas_BOBCAT);
|
|
||||||
case 16: return (&gotoblas_SANDYBRIDGE);
|
|
||||||
case 15: return (&gotoblas_NANO);
|
|
||||||
case 14: return (&gotoblas_BARCELONA);
|
|
||||||
case 13: return (&gotoblas_OPTERON);
|
|
||||||
case 12: return (&gotoblas_OPTERON_SSE3);
|
|
||||||
case 11: return (&gotoblas_ATHLON);
|
|
||||||
case 10: return (&gotoblas_NEHALEM);
|
|
||||||
case 9: return (&gotoblas_DUNNINGTON);
|
|
||||||
case 8: return (&gotoblas_PENRYN);
|
|
||||||
case 7: return (&gotoblas_CORE2);
|
|
||||||
case 6: return (&gotoblas_ATOM);
|
|
||||||
case 5: return (&gotoblas_BANIAS);
|
|
||||||
case 4: return (&gotoblas_PRESCOTT);
|
|
||||||
case 3: return (&gotoblas_NORTHWOOD);
|
|
||||||
case 2: return (&gotoblas_COPPERMINE);
|
|
||||||
case 1: return (&gotoblas_KATMAI);
|
|
||||||
}
|
|
||||||
return(NULL);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void gotoblas_dynamic_init(void) {
|
|
||||||
|
|
||||||
char coremsg[128];
|
|
||||||
char coren[22];
|
|
||||||
char *p;
|
|
||||||
|
|
||||||
|
|
||||||
if (gotoblas) return;
|
|
||||||
|
|
||||||
p = getenv("OPENBLAS_CORETYPE");
|
|
||||||
if ( p )
|
|
||||||
{
|
|
||||||
gotoblas = force_coretype(p);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
gotoblas = get_coretype();
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
|
||||||
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
|
|
||||||
#else
|
|
||||||
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
|
|
||||||
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
|
|
||||||
if (sizeof(void*) == 8) {
|
|
||||||
if (gotoblas == &gotoblas_KATMAI ||
|
|
||||||
gotoblas == &gotoblas_COPPERMINE ||
|
|
||||||
gotoblas == &gotoblas_NORTHWOOD ||
|
|
||||||
gotoblas == &gotoblas_BANIAS ||
|
|
||||||
gotoblas == &gotoblas_ATHLON)
|
|
||||||
gotoblas = &gotoblas_PRESCOTT;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (gotoblas && gotoblas -> init) {
|
|
||||||
strncpy(coren,gotoblas_corename(),20);
|
|
||||||
sprintf(coremsg, "Core: %s\n",coren);
|
|
||||||
openblas_warning(2, coremsg);
|
|
||||||
gotoblas -> init();
|
|
||||||
} else {
|
|
||||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void gotoblas_dynamic_quit(void) {
|
|
||||||
|
|
||||||
gotoblas = NULL;
|
|
||||||
|
|
||||||
}
|
|
2
f_check
2
f_check
|
@ -71,7 +71,7 @@ if ($compiler eq "") {
|
||||||
|
|
||||||
if ($data =~ /GNU/) {
|
if ($data =~ /GNU/) {
|
||||||
|
|
||||||
$data =~ /(\d)\.(\d).(\d)/;
|
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||||
$major = $1;
|
$major = $1;
|
||||||
$minor = $2;
|
$minor = $2;
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
USE_GEMM3M = 0
|
USE_GEMM3M = 0
|
||||||
|
OS := $(shell uname)
|
||||||
|
|
||||||
ifeq ($(ARCH), x86)
|
ifeq ($(ARCH), x86)
|
||||||
USE_GEMM3M = 1
|
USE_GEMM3M = 1
|
||||||
|
@ -59,8 +60,6 @@ USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
SKERNELOBJS += \
|
SKERNELOBJS += \
|
||||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
||||||
|
@ -438,7 +437,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
||||||
|
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||||
|
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
@ -446,12 +453,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
||||||
|
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||||
|
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
||||||
|
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||||
|
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
@ -462,7 +483,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
||||||
|
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||||
|
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -498,7 +526,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
||||||
|
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||||
|
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -514,7 +549,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
||||||
|
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||||
|
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -539,37 +581,107 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
||||||
|
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
|
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
||||||
|
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||||
|
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
|
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
||||||
|
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
||||||
|
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||||
|
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
||||||
|
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
||||||
|
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
||||||
|
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
||||||
|
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
||||||
|
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||||
|
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||||
|
@ -586,28 +698,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
||||||
|
|
||||||
ifdef USE_TRMM
|
ifdef USE_TRMM
|
||||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
||||||
|
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||||
|
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
||||||
|
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||||
|
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
||||||
|
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||||
|
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||||
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
||||||
|
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
||||||
|
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
||||||
|
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
||||||
|
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
@ -622,52 +790,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
||||||
|
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
||||||
|
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
||||||
|
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
||||||
|
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
||||||
|
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
||||||
|
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
||||||
|
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
||||||
|
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||||
|
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
||||||
|
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
||||||
|
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
||||||
|
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
||||||
|
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
||||||
|
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
||||||
|
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
||||||
|
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
||||||
|
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||||
|
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
else
|
else
|
||||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
@ -679,7 +960,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||||
|
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
@ -806,7 +1094,14 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
|
||||||
|
|
||||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||||
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
||||||
|
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||||
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||||
|
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||||
|
else
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
|
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
|
||||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
|
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
|
||||||
|
@ -1942,7 +2237,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
$(D<GEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||||
|
|
||||||
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||||
|
@ -2046,7 +2341,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||||
|
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||||
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||||
|
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||||
|
else
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||||
|
@ -2085,7 +2387,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||||
|
|
||||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
ifeq ($(OS), AIX)
|
||||||
|
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||||
|
m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
|
||||||
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||||
|
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||||
|
else
|
||||||
|
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||||
|
|
|
@ -102,6 +102,8 @@ CDOTKERNEL = zdot.S
|
||||||
ZDOTKERNEL = zdot.S
|
ZDOTKERNEL = zdot.S
|
||||||
DSDOTKERNEL = dot.S
|
DSDOTKERNEL = dot.S
|
||||||
|
|
||||||
|
DGEMM_BETA = dgemm_beta.S
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
|
|
@ -0,0 +1,178 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define M x0
|
||||||
|
#define N x1
|
||||||
|
#define BETA d0
|
||||||
|
#define LDC x6
|
||||||
|
#define C00 x7
|
||||||
|
|
||||||
|
#define A01 x8
|
||||||
|
#define A02 x9
|
||||||
|
#define A03 x10
|
||||||
|
#define A04 x11
|
||||||
|
|
||||||
|
#define beta0 d11
|
||||||
|
#define betaV0 v11.d[0]
|
||||||
|
#define I x16
|
||||||
|
|
||||||
|
#define size 128
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
.macro SAVE_REGS
|
||||||
|
add sp, sp, #-(11 * 16)
|
||||||
|
stp d8, d9, [sp, #(0 * 16)]
|
||||||
|
stp d10, d11, [sp, #(1 * 16)]
|
||||||
|
stp d12, d13, [sp, #(2 * 16)]
|
||||||
|
stp d14, d15, [sp, #(3 * 16)]
|
||||||
|
stp d16, d17, [sp, #(4 * 16)]
|
||||||
|
stp x18, x19, [sp, #(5 * 16)]
|
||||||
|
stp x20, x21, [sp, #(6 * 16)]
|
||||||
|
stp x22, x23, [sp, #(7 * 16)]
|
||||||
|
stp x24, x25, [sp, #(8 * 16)]
|
||||||
|
stp x26, x27, [sp, #(9 * 16)]
|
||||||
|
str x28, [sp, #(10 * 16)]
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro RESTORE_REGS
|
||||||
|
ldp d8, d9, [sp, #(0 * 16)]
|
||||||
|
ldp d10, d11, [sp, #(1 * 16)]
|
||||||
|
ldp d12, d13, [sp, #(2 * 16)]
|
||||||
|
ldp d14, d15, [sp, #(3 * 16)]
|
||||||
|
ldp d16, d17, [sp, #(4 * 16)]
|
||||||
|
ldp x18, x19, [sp, #(5 * 16)]
|
||||||
|
ldp x20, x21, [sp, #(6 * 16)]
|
||||||
|
ldp x22, x23, [sp, #(7 * 16)]
|
||||||
|
ldp x24, x25, [sp, #(8 * 16)]
|
||||||
|
ldp x26, x27, [sp, #(9 * 16)]
|
||||||
|
ldr x28, [sp, #(10 * 16)]
|
||||||
|
add sp, sp, #(11*16)
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
ldr LDC, [sp]
|
||||||
|
SAVE_REGS
|
||||||
|
|
||||||
|
.Lgemm_beta_BEGIN:
|
||||||
|
|
||||||
|
fmov beta0, BETA
|
||||||
|
cmp N, #0
|
||||||
|
ble .Lgemm_beta_L999
|
||||||
|
|
||||||
|
.Lgemm_beta_01:
|
||||||
|
|
||||||
|
lsl LDC, LDC, #3
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lgemm_beta_02:
|
||||||
|
|
||||||
|
mov A01, C00
|
||||||
|
add C00, C00, LDC
|
||||||
|
asr I, M, #4
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lgemm_beta_04
|
||||||
|
add A02, A01, #32
|
||||||
|
add A03, A02, #32
|
||||||
|
add A04, A03, #32
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lgemm_beta_03:
|
||||||
|
|
||||||
|
ldp q0, q1, [A01]
|
||||||
|
ldp q2, q3, [A02]
|
||||||
|
ldp q4, q5, [A03]
|
||||||
|
ldp q6, q7, [A04]
|
||||||
|
|
||||||
|
fmul v0.2d, v0.2d, betaV0
|
||||||
|
fmul v1.2d, v1.2d, betaV0
|
||||||
|
|
||||||
|
fmul v2.2d, v2.2d, betaV0
|
||||||
|
fmul v3.2d, v3.2d, betaV0
|
||||||
|
|
||||||
|
fmul v4.2d, v4.2d, betaV0
|
||||||
|
fmul v5.2d, v5.2d, betaV0
|
||||||
|
|
||||||
|
fmul v6.2d, v6.2d, betaV0
|
||||||
|
fmul v7.2d, v7.2d, betaV0
|
||||||
|
|
||||||
|
st1 {v0.2d, v1.2d}, [A01]
|
||||||
|
add A01, A01, size
|
||||||
|
st1 {v2.2d, v3.2d}, [A02]
|
||||||
|
add A02, A02, size
|
||||||
|
st1 {v4.2d, v5.2d}, [A03]
|
||||||
|
add A03, A03, size
|
||||||
|
st1 {v6.2d, v7.2d}, [A04]
|
||||||
|
add A04, A04, size
|
||||||
|
|
||||||
|
subs I , I , #1
|
||||||
|
bne .Lgemm_beta_03
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lgemm_beta_04:
|
||||||
|
|
||||||
|
and I, M , #15 // M%16
|
||||||
|
cmp I, #0
|
||||||
|
ble .Lgemm_beta_06
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lgemm_beta_05:
|
||||||
|
|
||||||
|
ldr d12, [A01]
|
||||||
|
fmul d12, d12, beta0
|
||||||
|
str d12, [A01]
|
||||||
|
add A01, A01, #8
|
||||||
|
|
||||||
|
subs I , I , #1
|
||||||
|
bne .Lgemm_beta_05
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lgemm_beta_06:
|
||||||
|
|
||||||
|
subs N , N, #1 // N--
|
||||||
|
bne .Lgemm_beta_02
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lgemm_beta_L999:
|
||||||
|
|
||||||
|
mov x0, #0
|
||||||
|
RESTORE_REGS
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
|
||||||
"xvaddsp 38, 38, %x5 \n\t"
|
"xvaddsp 38, 38, %x5 \n\t"
|
||||||
"xvaddsp 39, 39, %x6 \n\t"
|
"xvaddsp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
|
|
@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs46, o32, T1
|
stxvw4x vs46, o32, T1
|
||||||
stxvw4x vs47, o48, T1
|
stxvw4x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs38, o0, T1
|
stxsspx vs38, o0, T1
|
||||||
stxsspx vs39, o4, T1
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs34, o0, T1
|
stxsspx vs34, o0, T1
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %[x_ptr], %[x_ptr], 64 \n\t"
|
"addi %[x_ptr], %[x_ptr], 64 \n\t"
|
||||||
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
||||||
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
"xvmulsp 42, 34, 36 \n\t"
|
"xvmulsp 42, 34, 36 \n\t"
|
||||||
|
@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %[x_ptr], %[x_ptr], 128 \n\t"
|
"addi %[x_ptr], %[x_ptr], 128 \n\t"
|
||||||
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
||||||
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
"addic. %[temp_n], %[temp_n], -8 \n\t"
|
||||||
"bgt 1b \n\t"
|
"bgt one%= \n\t"
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
"xvmulsp 42, 34, 36 \n\t"
|
"xvmulsp 42, 34, 36 \n\t"
|
||||||
|
|
|
@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
|
@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -32 \n\t"
|
"addic. %2, %2, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x)
|
||||||
"xvadddp 38, 38, %x5 \n\t"
|
"xvadddp 38, 38, %x5 \n\t"
|
||||||
"xvadddp 39, 39, %x6 \n\t"
|
"xvadddp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x)
|
||||||
|
|
||||||
"xvadddp 32, 32, 36 \n\t"
|
"xvadddp 32, 32, 36 \n\t"
|
||||||
|
|
||||||
"xxswapd 33, 32 \n\t"
|
XXSWAPD_S(33,32)
|
||||||
"xsadddp %x0, 32, 33 \n"
|
"xsadddp %x0, 32, 33 \n"
|
||||||
|
|
||||||
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
||||||
|
|
|
@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd %x4, %x22, 0 \n\t"
|
XXSPLTD_S(%x4,%x22,0)
|
||||||
|
|
||||||
"dcbt 0, %2 \n\t"
|
"dcbt 0, %2 \n\t"
|
||||||
"dcbt 0, %3 \n\t"
|
"dcbt 0, %3 \n\t"
|
||||||
|
@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||||
"addi %3, %3, -64 \n\t"
|
"addi %3, %3, -64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp %x13, %x5, %x4 \n\t"
|
"xvmaddadp %x13, %x5, %x4 \n\t"
|
||||||
"xvmaddadp %x14, %x6, %x4 \n\t"
|
"xvmaddadp %x14, %x6, %x4 \n\t"
|
||||||
|
@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
||||||
"addi %3, %3, -64 \n\t"
|
"addi %3, %3, -64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp %x13, %x5, %x4 \n\t"
|
"xvmaddadp %x13, %x5, %x4 \n\t"
|
||||||
"xvmaddadp %x14, %x6, %x4 \n\t"
|
"xvmaddadp %x14, %x6, %x4 \n\t"
|
||||||
|
|
|
@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
|
|
@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t"
|
"xvmaddadp 32, 40, 48 \n\t"
|
||||||
"lxvd2x 40, 0, %2 \n\t"
|
"lxvd2x 40, 0, %2 \n\t"
|
||||||
|
@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t"
|
"xvmaddadp 32, 40, 48 \n\t"
|
||||||
"xvmaddadp 33, 41, 49 \n\t"
|
"xvmaddadp 33, 41, 49 \n\t"
|
||||||
|
@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y)
|
||||||
|
|
||||||
"xvadddp 32, 32, 36 \n\t"
|
"xvadddp 32, 32, 36 \n\t"
|
||||||
|
|
||||||
"xxswapd 33, 32 \n\t"
|
XXSWAPD_S(33,32)
|
||||||
|
|
||||||
"xsadddp %x0, 32, 33 \n"
|
"xsadddp %x0, 32, 33 \n"
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=16
|
* Macros for N=4 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x16', `
|
||||||
|
#else
|
||||||
.macro COPY_4x16
|
.macro COPY_4x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o0, A1
|
lxvd2x vs1, o0, A1
|
||||||
|
@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs0, o0, A0
|
lxsdx vs0, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=16
|
* Macros for N=2 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x16', `
|
||||||
|
#else
|
||||||
.macro COPY_2x16
|
.macro COPY_2x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 128
|
addi BO, BO, 128
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs0, o0, A0
|
lxsdx vs0, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 16
|
addi BO, BO, 16
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=16
|
* Macros for N=1 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x16', `
|
||||||
|
#else
|
||||||
.macro COPY_1x16
|
.macro COPY_1x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 64
|
addi BO, BO, 64
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
lxvd2x vs1, o16, A0
|
lxvd2x vs1, o16, A0
|
||||||
|
@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 32
|
addi BO, BO, 32
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs0, o0, A0
|
lxvd2x vs0, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 16
|
addi BO, BO, 16
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs0, o0, A0
|
lxsdx vs0, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi BO, BO, 8
|
addi BO, BO, 8
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=16
|
* Macros for N=4 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x16', `
|
||||||
|
#else
|
||||||
.macro COPY_4x16
|
.macro COPY_4x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs10, o32, T1
|
stxvd2x vs10, o32, T1
|
||||||
stxvd2x vs11, o48, T1
|
stxvd2x vs11, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs32, o0, A0
|
lxsdx vs32, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsdx vs35, o8, T1
|
stxsdx vs35, o8, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=16
|
* Macros for N=2 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x16', `
|
||||||
|
#else
|
||||||
.macro COPY_2x16
|
.macro COPY_2x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs32, o0, A0
|
lxsdx vs32, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsdx vs33, o8, T1
|
stxsdx vs33, o8, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=16
|
* Macros for N=1 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x16', `
|
||||||
|
#else
|
||||||
.macro COPY_1x16
|
.macro COPY_1x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsdx vs32, o0, A0
|
lxsdx vs32, o0, A0
|
||||||
addi A0, A0, 8
|
addi A0, A0, 8
|
||||||
|
@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsdx vs32, o0, T1
|
stxsdx vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
(
|
(
|
||||||
"lxvd2x 34, 0, %10 \n\t" // x0, x1
|
"lxvd2x 34, 0, %10 \n\t" // x0, x1
|
||||||
"lxvd2x 35, %11, %10 \n\t" // x2, x3
|
"lxvd2x 35, %11, %10 \n\t" // x2, x3
|
||||||
"xxspltd 32, %x9, 0 \n\t" // alpha, alpha
|
XXSPLTD_S(32,%x9,0) // alpha, alpha
|
||||||
|
|
||||||
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
|
||||||
|
|
||||||
|
@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
|
||||||
"add %6, %6, %6 \n\t" // 2 * lda
|
"add %6, %6, %6 \n\t" // 2 * lda
|
||||||
|
|
||||||
"xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
|
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
|
||||||
"xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
|
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
|
||||||
"xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
|
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
|
||||||
"xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
|
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
|
||||||
|
|
||||||
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
|
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
|
||||||
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
|
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
|
||||||
|
@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %6, %6, 32 \n\t"
|
"addi %6, %6, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||||
|
@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
|
@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
|
@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
|
@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
|
||||||
"addi %2, %2, 32 \n\t"
|
"addi %2, %2, 32 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -4 \n\t"
|
"addic. %1, %1, -4 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
"lxvd2x 36, 0, %2 \n\t" // y0, y1
|
||||||
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
"lxvd2x 37, %11, %2 \n\t" // y2, y3
|
||||||
|
|
|
@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"li %[off],32 \n\t"
|
"li %[off],32 \n\t"
|
||||||
|
|
||||||
|
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
|
|
||||||
//--------------------------------------------------
|
//--------------------------------------------------
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
"xvmaddadp 35,38,32 \n\t"
|
"xvmaddadp 35,38,32 \n\t"
|
||||||
"addi %[off2], %[off2],32 \n\t"
|
"addi %[off2], %[off2],32 \n\t"
|
||||||
|
@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"lxvd2x 49, %[a6], %[off2] \n\t"
|
"lxvd2x 49, %[a6], %[off2] \n\t"
|
||||||
"lxvd2x 51, %[a7], %[off2] \n\t"
|
"lxvd2x 51, %[a7], %[off2] \n\t"
|
||||||
"lxvd2x 33, %[x], %[off2] \n\t"
|
"lxvd2x 33, %[x], %[off2] \n\t"
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
"xvmaddadp 35,38,32 \n\t"
|
"xvmaddadp 35,38,32 \n\t"
|
||||||
"addi %[off2], %[off2],32 \n\t"
|
"addi %[off2], %[off2],32 \n\t"
|
||||||
|
@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"lxvd2x 49, %[a6], %[off2] \n\t"
|
"lxvd2x 49, %[a6], %[off2] \n\t"
|
||||||
"lxvd2x 51, %[a7], %[off2] \n\t"
|
"lxvd2x 51, %[a7], %[off2] \n\t"
|
||||||
"lxvd2x 33, %[x], %[off2] \n\t"
|
"lxvd2x 33, %[x], %[off2] \n\t"
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
"xvmaddadp 35,38,32 \n\t"
|
"xvmaddadp 35,38,32 \n\t"
|
||||||
#if defined(PREFETCH)
|
#if defined(PREFETCH)
|
||||||
|
@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
|
|
||||||
"lxvd2x 33, %[x], %[off2] \n\t"
|
"lxvd2x 33, %[x], %[off2] \n\t"
|
||||||
"addic. %[n],%[n],-4 \n\t"
|
"addic. %[n],%[n],-4 \n\t"
|
||||||
"ble- 2f \n\t"
|
"ble- two%= \n\t"
|
||||||
|
|
||||||
"addi %[off2], %[off2],32 \n\t"
|
"addi %[off2], %[off2],32 \n\t"
|
||||||
#if defined(PREFETCH)
|
#if defined(PREFETCH)
|
||||||
|
@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
#if defined(PREFETCH)
|
#if defined(PREFETCH)
|
||||||
"dcbt %[temp],%[x] \n\t"
|
"dcbt %[temp],%[x] \n\t"
|
||||||
#endif
|
#endif
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
//--------------------------------------------
|
//--------------------------------------------
|
||||||
|
|
||||||
"xvmaddadp 34,36,32 \n\t"
|
"xvmaddadp 34,36,32 \n\t"
|
||||||
|
@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
"xvmaddadp 7,46,32 \n\t"
|
"xvmaddadp 7,46,32 \n\t"
|
||||||
"xvmaddadp 8,48,32 \n\t"
|
"xvmaddadp 8,48,32 \n\t"
|
||||||
"xvmaddadp 9,50,32 \n\t"
|
"xvmaddadp 9,50,32 \n\t"
|
||||||
"xxspltd 36, %x[alpha], 0 \n\t"
|
XXSPLTD_S(36,%x[alpha],0)
|
||||||
"xvmaddadp 34,37,33 \n\t"
|
"xvmaddadp 34,37,33 \n\t"
|
||||||
"xvmaddadp 35,39,33 \n\t"
|
"xvmaddadp 35,39,33 \n\t"
|
||||||
"xvmaddadp 4,41,33 \n\t"
|
"xvmaddadp 4,41,33 \n\t"
|
||||||
|
@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xxmrgld 42,34,35 \n\t"
|
XXMRGLD_S(42,34,35)
|
||||||
"xxmrghd 43,34,35 \n\t"
|
XXMRGHD_S(43,34,35)
|
||||||
|
|
||||||
"xxmrgld 44,4,5 \n\t"
|
XXMRGLD_S(44,4,5)
|
||||||
"xxmrghd 45,4,5 \n\t"
|
XXMRGHD_S(45,4,5)
|
||||||
|
|
||||||
"xvadddp 42,42,43 \n\t"
|
"xvadddp 42,42,43 \n\t"
|
||||||
|
|
||||||
"xxmrgld 46,6,7 \n\t"
|
XXMRGLD_S(46,6,7)
|
||||||
"xxmrghd 47,6,7 \n\t"
|
XXMRGHD_S(47,6,7)
|
||||||
|
|
||||||
"xvadddp 44,44,45 \n\t"
|
"xvadddp 44,44,45 \n\t"
|
||||||
|
|
||||||
"xxmrgld 48,8,9 \n\t"
|
XXMRGLD_S(48,8,9)
|
||||||
"xxmrghd 49,8,9 \n\t"
|
XXMRGHD_S(49,8,9)
|
||||||
|
|
||||||
"xvadddp 46,46,47 \n\t"
|
"xvadddp 46,46,47 \n\t"
|
||||||
|
|
||||||
|
|
|
@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd 36, %x13, 0 \n\t" // load c to both dwords
|
XXSPLTD_S(36,%x13,0) // load c to both dwords
|
||||||
"xxspltd 37, %x14, 0 \n\t" // load s to both dwords
|
XXSPLTD_S(37,%x14,0) // load s to both dwords
|
||||||
|
|
||||||
"lxvd2x 32, 0, %3 \n\t" // load x
|
"lxvd2x 32, 0, %3 \n\t" // load x
|
||||||
"lxvd2x 33, %15, %3 \n\t"
|
"lxvd2x 33, %15, %3 \n\t"
|
||||||
|
@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||||
"addi %4, %4, 64 \n\t"
|
"addi %4, %4, 64 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -8 \n\t"
|
"addic. %2, %2, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -8 \n\t"
|
"addic. %2, %2, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
|
|
@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||||
(
|
(
|
||||||
"dcbt 0, %2 \n\t"
|
"dcbt 0, %2 \n\t"
|
||||||
|
|
||||||
"xxspltd %x3, %x3, 0 \n\t"
|
XXSPLTD_S(%x3,%x3,0)
|
||||||
|
|
||||||
"lxvd2x 32, 0, %2 \n\t"
|
"lxvd2x 32, 0, %2 \n\t"
|
||||||
"lxvd2x 33, %4, %2 \n\t"
|
"lxvd2x 33, %4, %2 \n\t"
|
||||||
|
@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, %x3 \n\t"
|
"xvmuldp 40, 32, %x3 \n\t"
|
||||||
"xvmuldp 41, 33, %x3 \n\t"
|
"xvmuldp 41, 33, %x3 \n\t"
|
||||||
|
@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||||
"addi %2, %2, 256 \n\t"
|
"addi %2, %2, 256 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, %x3 \n\t"
|
"xvmuldp 40, 32, %x3 \n\t"
|
||||||
"xvmuldp 41, 33, %x3 \n\t"
|
"xvmuldp 41, 33, %x3 \n\t"
|
||||||
|
@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x)
|
||||||
(
|
(
|
||||||
"xxlxor %x3, %x3, %x3 \n\t"
|
"xxlxor %x3, %x3, %x3 \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x %x3, 0, %2 \n\t"
|
"stxvd2x %x3, 0, %2 \n\t"
|
||||||
"stxvd2x %x3, %4, %2 \n\t"
|
"stxvd2x %x3, %4, %2 \n\t"
|
||||||
|
@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
||||||
:
|
:
|
||||||
|
|
|
@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
|
@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -32 \n\t"
|
"addic. %2, %2, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
|
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
|
|
||||||
"xvabsdp 44, 44 \n\t"
|
"xvabsdp 44, 44 \n\t"
|
||||||
"xvabsdp 45, 45 \n\t"
|
"xvabsdp 45, 45 \n\t"
|
||||||
|
@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
//===================================================================
|
//===================================================================
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
|
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvcmpgtdp 2,45,44 \n\t "
|
"xvcmpgtdp 2,45,44 \n\t "
|
||||||
"xvcmpgtdp 3,47,46 \n\t "
|
"xvcmpgtdp 3,47,46 \n\t "
|
||||||
"xvcmpgtdp 4,49,48 \n\t "
|
"xvcmpgtdp 4,49,48 \n\t "
|
||||||
"xvcmpgtdp 5,51,50 \n\t"
|
"xvcmpgtdp 5,7,6 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2, 1,0 \n\t"
|
"xvcmpgtdp 2, 1,0 \n\t"
|
||||||
"xvcmpgtdp 3,47, 45 \n\t"
|
"xvcmpgtdp 3,47, 45 \n\t"
|
||||||
|
@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
||||||
"xvcmpgtdp 2, 3,39 \n\t"
|
"xvcmpgtdp 2, 3,39 \n\t"
|
||||||
|
@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//<-----------jump here from first load
|
//<-----------jump here from first load
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,45,44 \n\t "
|
"xvcmpgtdp 2,45,44 \n\t "
|
||||||
"xvcmpgtdp 3,47,46 \n\t "
|
"xvcmpgtdp 3,47,46 \n\t "
|
||||||
"xvcmpgtdp 4,49,48 \n\t "
|
"xvcmpgtdp 4,49,48 \n\t "
|
||||||
"xvcmpgtdp 5,51,50 \n\t"
|
"xvcmpgtdp 5,7,6 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2, 1,0 \n\t"
|
"xvcmpgtdp 2, 1,0 \n\t"
|
||||||
"xvcmpgtdp 3,47, 45 \n\t"
|
"xvcmpgtdp 3,47, 45 \n\t"
|
||||||
|
@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -32 \n\t"
|
"addic. %[n], %[n], -32 \n\t"
|
||||||
|
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
|
|
||||||
"xvcmpgtdp 2,45,44 \n\t "
|
"xvcmpgtdp 2,45,44 \n\t "
|
||||||
"xvcmpgtdp 3,47,46 \n\t "
|
"xvcmpgtdp 3,47,46 \n\t "
|
||||||
"xvcmpgtdp 4,49,48 \n\t "
|
"xvcmpgtdp 4,49,48 \n\t "
|
||||||
"xvcmpgtdp 5,51,50 \n\t"
|
"xvcmpgtdp 5,7,6 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2, 1,0 \n\t"
|
"xvcmpgtdp 2, 1,0 \n\t"
|
||||||
"xvcmpgtdp 3,47, 45 \n\t"
|
"xvcmpgtdp 3,47, 45 \n\t"
|
||||||
|
@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
///////extract max value and max index from vector
|
///////extract max value and max index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4, 40,39 \n\t"
|
"xvcmpgtdp 4, 40,39 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
"xvabsdp 39, 39 \n\t"
|
"xvabsdp 39, 39 \n\t"
|
||||||
|
|
||||||
"xvabsdp 44, 44 \n\t"
|
"xvabsdp 44, 44 \n\t"
|
||||||
|
@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
//===================================================================
|
//===================================================================
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
|
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"xvcmpgtdp 2,44,45 \n\t "
|
"xvcmpgtdp 2,44,45 \n\t "
|
||||||
"xvcmpgtdp 3,46,47 \n\t "
|
"xvcmpgtdp 3,46,47 \n\t "
|
||||||
"xvcmpgtdp 4,48,49 \n\t "
|
"xvcmpgtdp 4,48,49 \n\t "
|
||||||
"xvcmpgtdp 5,50,51 \n\t"
|
"xvcmpgtdp 5,6,7 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0, 1 \n\t"
|
"xvcmpgtdp 2,0, 1 \n\t"
|
||||||
"xvcmpgtdp 3, 45,47 \n\t"
|
"xvcmpgtdp 3, 45,47 \n\t"
|
||||||
|
@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
|
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
|
||||||
"xvcmpgtdp 2,39, 3 \n\t"
|
"xvcmpgtdp 2,39, 3 \n\t"
|
||||||
|
@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//<-----------jump here from first load
|
//<-----------jump here from first load
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,44,45 \n\t "
|
"xvcmpgtdp 2,44,45 \n\t "
|
||||||
"xvcmpgtdp 3,46,47 \n\t "
|
"xvcmpgtdp 3,46,47 \n\t "
|
||||||
"xvcmpgtdp 4,48,49 \n\t "
|
"xvcmpgtdp 4,48,49 \n\t "
|
||||||
"xvcmpgtdp 5,50,51 \n\t"
|
"xvcmpgtdp 5,6,7 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0, 1 \n\t"
|
"xvcmpgtdp 2,0, 1 \n\t"
|
||||||
"xvcmpgtdp 3, 45,47 \n\t"
|
"xvcmpgtdp 3, 45,47 \n\t"
|
||||||
|
@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
"vaddudm 1,1,5 \n\t" // get real index for first smaller
|
||||||
|
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -32 \n\t"
|
"addic. %[n], %[n], -32 \n\t"
|
||||||
|
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
|
|
||||||
"xvcmpgtdp 2,44,45 \n\t "
|
"xvcmpgtdp 2,44,45 \n\t "
|
||||||
"xvcmpgtdp 3,46,47 \n\t "
|
"xvcmpgtdp 3,46,47 \n\t "
|
||||||
"xvcmpgtdp 4,48,49 \n\t "
|
"xvcmpgtdp 4,48,49 \n\t "
|
||||||
"xvcmpgtdp 5,50,51 \n\t"
|
"xvcmpgtdp 5,6,7 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,2 \n\t"
|
"xxsel 32,40,41,2 \n\t"
|
||||||
"xxsel 0,44,45,2 \n\t"
|
"xxsel 0,44,45,2 \n\t"
|
||||||
|
@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xxsel 34,40,41,4 \n\t"
|
"xxsel 34,40,41,4 \n\t"
|
||||||
"xxsel 45,48,49,4 \n\t"
|
"xxsel 45,48,49,4 \n\t"
|
||||||
"xxsel 35,42,43,5 \n\t"
|
"xxsel 35,42,43,5 \n\t"
|
||||||
"xxsel 47,50,51,5 \n\t"
|
"xxsel 47,6,7,5 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0, 1 \n\t"
|
"xvcmpgtdp 2,0, 1 \n\t"
|
||||||
"xvcmpgtdp 3, 45,47 \n\t"
|
"xvcmpgtdp 3, 45,47 \n\t"
|
||||||
|
@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
///////extract min value and min index from vector
|
///////extract min value and min index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4,39, 40 \n\t"
|
"xvcmpgtdp 4,39, 40 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
|
|
@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
|
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,47,46 \n\t "
|
"xvcmpgtdp 6,47,46 \n\t "
|
||||||
"xvcmpgtdp 51,49,48 \n\t "
|
"xvcmpgtdp 7,49,48 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//>>/////////////////////////////// half start
|
//>>/////////////////////////////// half start
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
"xvadddp 48, 4,5 \n\t"
|
"xvadddp 48, 4,5 \n\t"
|
||||||
"xvadddp 49, 44,45 \n\t"
|
"xvadddp 49, 44,45 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 50,47,46 \n\t "
|
"xvcmpgtdp 6,47,46 \n\t "
|
||||||
"xvcmpgtdp 51,49,48 \n\t "
|
"xvcmpgtdp 7,49,48 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -16 \n\t"
|
"addic. %[n], %[n], -16 \n\t"
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,47,46 \n\t "
|
"xvcmpgtdp 6,47,46 \n\t "
|
||||||
"xvcmpgtdp 51,49,48 \n\t "
|
"xvcmpgtdp 7,49,48 \n\t "
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,1,0 \n\t "
|
"xvcmpgtdp 2,1,0 \n\t "
|
||||||
"xxsel 32,32,33,2 \n\t"
|
"xxsel 32,32,33,2 \n\t"
|
||||||
|
@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
///////extract max value and max index from vector
|
///////extract max value and max index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4, 40,39 \n\t"
|
"xvcmpgtdp 4, 40,39 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
|
|
@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
|
|
||||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||||
|
@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||||
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
||||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||||
"xxspltd 36,36,0 \n\t"
|
XXSPLTD_S(36,36,0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
//jump first half forward
|
//jump first half forward
|
||||||
"b 2f \n\t"
|
"b two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n\t"
|
".align 5 \n\t"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,46,47 \n\t "
|
"xvcmpgtdp 6,46,47 \n\t "
|
||||||
"xvcmpgtdp 51,48,49 \n\t "
|
"xvcmpgtdp 7,48,49 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//>>/////////////////////////////// half start
|
//>>/////////////////////////////// half start
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
"xvadddp 48, 4,5 \n\t"
|
"xvadddp 48, 4,5 \n\t"
|
||||||
"xvadddp 49, 44,45 \n\t"
|
"xvadddp 49, 44,45 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 50,46,47 \n\t "
|
"xvcmpgtdp 6,46,47 \n\t "
|
||||||
"xvcmpgtdp 51,48,49 \n\t "
|
"xvcmpgtdp 7,48,49 \n\t "
|
||||||
|
|
||||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||||
|
@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
||||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
||||||
//select with previous
|
//select with previous
|
||||||
"xxsel 38,38,32,4 \n\t"
|
"xxsel 38,38,32,4 \n\t"
|
||||||
"xxsel 39,39,3,4 \n\t"
|
"xxsel 39,39,3,4 \n\t"
|
||||||
|
@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
"xvabsdp 47, 47 \n\t"
|
"xvabsdp 47, 47 \n\t"
|
||||||
"xvabsdp 48, 48 \n\t"
|
"xvabsdp 48, 48 \n\t"
|
||||||
"xvabsdp 49, 49 \n\t"
|
"xvabsdp 49, 49 \n\t"
|
||||||
"xvabsdp 50, 50 \n\t"
|
"xvabsdp 6, 6 \n\t"
|
||||||
"xvabsdp 51, 51 \n\t"
|
"xvabsdp 7, 7 \n\t"
|
||||||
|
|
||||||
|
|
||||||
//decrement n
|
//decrement n
|
||||||
"addic. %[n], %[n], -16 \n\t"
|
"addic. %[n], %[n], -16 \n\t"
|
||||||
//Loop back if >0
|
//Loop back if >0
|
||||||
"bgt+ 1b \n\t"
|
"bgt+ one%= \n\t"
|
||||||
|
|
||||||
|
|
||||||
"xxmrghd 0,44,45 \n\t"
|
XXMRGHD_S(0,44,45)
|
||||||
"xxmrgld 1,44,45 \n\t"
|
XXMRGLD_S(1,44,45)
|
||||||
"xxmrghd 2,46,47 \n\t"
|
XXMRGHD_S(2,46,47)
|
||||||
"xxmrgld 3,46,47 \n\t"
|
XXMRGLD_S(3,46,47)
|
||||||
"xxmrghd 4,48,49 \n\t"
|
XXMRGHD_S(4,48,49)
|
||||||
"xxmrgld 5,48,49 \n\t"
|
XXMRGLD_S(5,48,49)
|
||||||
"xxmrghd 44,50,51 \n\t"
|
XXMRGHD_S(44,6,7)
|
||||||
"xxmrgld 45,50,51 \n\t"
|
XXMRGLD_S(45,6,7)
|
||||||
|
|
||||||
"xvadddp 46, 0,1 \n\t"
|
"xvadddp 46, 0,1 \n\t"
|
||||||
"xvadddp 47, 2,3 \n\t"
|
"xvadddp 47, 2,3 \n\t"
|
||||||
|
@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"xvcmpgtdp 50,46,47 \n\t "
|
"xvcmpgtdp 6,46,47 \n\t "
|
||||||
"xvcmpgtdp 51,48,49 \n\t "
|
"xvcmpgtdp 7,48,49 \n\t "
|
||||||
|
|
||||||
"xxsel 32,40,41,50 \n\t"
|
"xxsel 32,40,41,6 \n\t"
|
||||||
"xxsel 0,46,47,50 \n\t"
|
"xxsel 0,46,47,6 \n\t"
|
||||||
"xxsel 33,42,43,51 \n\t"
|
"xxsel 33,42,43,7 \n\t"
|
||||||
"xxsel 1,48,49,51 \n\t"
|
"xxsel 1,48,49,7 \n\t"
|
||||||
|
|
||||||
"xvcmpgtdp 2,0,1 \n\t "
|
"xvcmpgtdp 2,0,1 \n\t "
|
||||||
"xxsel 32,32,33,2 \n\t"
|
"xxsel 32,32,33,2 \n\t"
|
||||||
|
@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
///////extract min value and min index from vector
|
///////extract min value and min index from vector
|
||||||
|
|
||||||
"xxspltd 32,38,1 \n\t"
|
XXSPLTD_S(32,38,1)
|
||||||
"xxspltd 40,39,1 \n\t"
|
XXSPLTD_S(40,39,1)
|
||||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||||
|
|
||||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||||
//0b001110=14
|
//0b001110=14
|
||||||
"bc 14,24, 3f \n\t"
|
"bc 14,24, three%= \n\t"
|
||||||
"xvcmpgtdp 4,39, 40 \n\t"
|
"xvcmpgtdp 4,39, 40 \n\t"
|
||||||
"xxsel 0,39,40,4 \n\t"
|
"xxsel 0,39,40,4 \n\t"
|
||||||
"xxsel 1,38,32,4 \n\t"
|
"xxsel 1,38,32,4 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
"b 4f \n\t"
|
"b four%= \n\t"
|
||||||
|
|
||||||
"3: \n\t"
|
"three%=: \n\t"
|
||||||
//if elements value are equal then choose minimum index
|
//if elements value are equal then choose minimum index
|
||||||
"xxspltd 0,40,0 \n\t"
|
XXSPLTD_S(0,40,0)
|
||||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||||
"xxlor 1,32,32 \n\t"
|
"xxlor 1,32,32 \n\t"
|
||||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||||
|
|
||||||
|
|
||||||
"4: \n\t"
|
"four%=: \n\t"
|
||||||
"mfvsrd %[index],1 \n\t"
|
"mfvsrd %[index],1 \n\t"
|
||||||
|
|
||||||
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||||
|
@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
||||||
);
|
);
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
|
|
|
@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
" .machine \"any\" ;"
|
" .machine \"any\" ;"
|
||||||
"0: lwarx %0,0, %1 ;"
|
"0: lwarx %0,0, %1 ;"
|
||||||
" cmpwi 0,%0,0;"
|
" cmpwi 0,%0,0;"
|
||||||
" bne 1f;"
|
" bne one%=;"
|
||||||
" stwcx. %2,0, %1 ;"
|
" stwcx. %2,0, %1 ;"
|
||||||
" bne- 0b;"
|
" bne- 0b;"
|
||||||
"1: "
|
"one%=: "
|
||||||
: "=&r"(ret)
|
: "=&r"(ret)
|
||||||
: "r"(address), "r" (val)
|
: "r"(address), "r" (val)
|
||||||
: "cr0", "memory");
|
: "cr0", "memory");
|
||||||
|
|
|
@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x)
|
||||||
"xvaddsp 38, 38, %x5 \n\t"
|
"xvaddsp 38, 38, %x5 \n\t"
|
||||||
"xvaddsp 39, 39, %x6 \n\t"
|
"xvaddsp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabssp 48, 40 \n\t"
|
"xvabssp 48, 40 \n\t"
|
||||||
"xvabssp 49, 41 \n\t"
|
"xvabssp 49, 41 \n\t"
|
||||||
|
|
|
@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 40, 0, %3 \n\t"
|
"stxvd2x 40, 0, %3 \n\t"
|
||||||
"stxvd2x 41, %5, %3 \n\t"
|
"stxvd2x 41, %5, %3 \n\t"
|
||||||
|
@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 40, 0, %3 \n\t"
|
"stxvd2x 40, 0, %3 \n\t"
|
||||||
"stxvd2x 41, %5, %3 \n\t"
|
"stxvd2x 41, %5, %3 \n\t"
|
||||||
|
|
|
@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddasp 32, 40, 48 \n\t"
|
"xvmaddasp 32, 40, 48 \n\t"
|
||||||
"lxvd2x 40, 0, %2 \n\t"
|
"lxvd2x 40, 0, %2 \n\t"
|
||||||
|
@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
||||||
"addi %3, %3, 128 \n\t"
|
"addi %3, %3, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddasp 32, 40, 48 \n\t"
|
"xvmaddasp 32, 40, 48 \n\t"
|
||||||
"xvmaddasp 33, 41, 49 \n\t"
|
"xvmaddasp 33, 41, 49 \n\t"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=16
|
* Macros for N=4 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x16', `
|
||||||
|
#else
|
||||||
.macro COPY_4x16
|
.macro COPY_4x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs46, o32, T1
|
stxvw4x vs46, o32, T1
|
||||||
stxvw4x vs47, o48, T1
|
stxvw4x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs38, o0, T1
|
stxsspx vs38, o0, T1
|
||||||
stxsspx vs39, o4, T1
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=16
|
* Macros for N=2 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x16', `
|
||||||
|
#else
|
||||||
.macro COPY_2x16
|
.macro COPY_2x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs34, o0, T1
|
stxsspx vs34, o0, T1
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=16
|
* Macros for N=1 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x16', `
|
||||||
|
#else
|
||||||
.macro COPY_1x16
|
.macro COPY_1x16
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs38, o32, T1
|
stxvw4x vs38, o32, T1
|
||||||
stxvw4x vs39, o48, T1
|
stxvw4x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs38, o0, T1
|
stxsspx vs38, o0, T1
|
||||||
stxsspx vs39, o4, T1
|
stxsspx vs39, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs34, o32, T1
|
stxvw4x vs34, o32, T1
|
||||||
stxvw4x vs35, o48, T1
|
stxvw4x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs34, o0, T1
|
stxsspx vs34, o0, T1
|
||||||
stxsspx vs35, o4, T1
|
stxsspx vs35, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
lxvw4x vs33, o16, A0
|
lxvw4x vs33, o16, A0
|
||||||
|
@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
stxvw4x vs33, o16, T1
|
stxvw4x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvw4x vs32, o0, A0
|
lxvw4x vs32, o0, A0
|
||||||
|
|
||||||
|
@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvw4x vs32, o0, T1
|
stxvw4x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
lxsspx vs33, o4, A0
|
lxsspx vs33, o4, A0
|
||||||
|
@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
stxsspx vs33, o4, T1
|
stxsspx vs33, o4, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxsspx vs32, o0, A0
|
lxsspx vs32, o0, A0
|
||||||
|
|
||||||
|
@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxsspx vs32, o0, T1
|
stxsspx vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %4, %4, 64 \n\t"
|
"addi %4, %4, 64 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -16 \n\t"
|
"addic. %2, %2, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
|
@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -16 \n\t"
|
"addic. %2, %2, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmulsp 41, 33, 36 \n\t"
|
"xvmulsp 41, 33, 36 \n\t"
|
||||||
|
|
|
@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, %x3 \n\t"
|
"xvmulsp 40, 32, %x3 \n\t"
|
||||||
"xvmulsp 41, 33, %x3 \n\t"
|
"xvmulsp 41, 33, %x3 \n\t"
|
||||||
|
@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||||
"addi %2, %2, 256 \n\t"
|
"addi %2, %2, 256 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmulsp 40, 32, %x3 \n\t"
|
"xvmulsp 40, 32, %x3 \n\t"
|
||||||
"xvmulsp 41, 33, %x3 \n\t"
|
"xvmulsp 41, 33, %x3 \n\t"
|
||||||
|
@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x)
|
||||||
(
|
(
|
||||||
"xxlxor %x3, %x3, %x3 \n\t"
|
"xxlxor %x3, %x3, %x3 \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x %x3, 0, %2 \n\t"
|
"stxvd2x %x3, 0, %2 \n\t"
|
||||||
"stxvd2x %x3, %4, %2 \n\t"
|
"stxvd2x %x3, %4, %2 \n\t"
|
||||||
|
@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -32 \n\t"
|
"addic. %1, %1, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
||||||
:
|
:
|
||||||
|
|
|
@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
|
@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y)
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
|
|
||||||
"addic. %2, %2, -32 \n\t"
|
"addic. %2, %2, -32 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x)
|
||||||
"xvadddp 38, 38, %x5 \n\t"
|
"xvadddp 38, 38, %x5 \n\t"
|
||||||
"xvadddp 39, 39, %x6 \n\t"
|
"xvadddp 39, 39, %x6 \n\t"
|
||||||
|
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvabsdp 48, 40 \n\t"
|
"xvabsdp 48, 40 \n\t"
|
||||||
"xvabsdp 49, 41 \n\t"
|
"xvabsdp 49, 41 \n\t"
|
||||||
|
@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x)
|
||||||
|
|
||||||
"xvadddp 32, 32, 36 \n\t"
|
"xvadddp 32, 32, 36 \n\t"
|
||||||
|
|
||||||
"xxswapd 33, 32 \n\t"
|
XXSWAPD_S(33,32)
|
||||||
"xsadddp %x0, 32, 33 \n"
|
"xsadddp %x0, 32, 33 \n"
|
||||||
|
|
||||||
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
|
||||||
|
|
|
@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd 32, %x19, 0 \n\t" // alpha_r
|
XXSPLTD_S(32,%x19,0) // alpha_r
|
||||||
"xxspltd 33, %x20, 0 \n\t" // alpha_i
|
XXSPLTD_S(33,%x20,0) // alpha_i
|
||||||
|
|
||||||
"lxvd2x 36, 0, %21 \n\t" // mvec
|
"lxvd2x 36, 0, %21 \n\t" // mvec
|
||||||
|
|
||||||
|
@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
"lxvd2x 50, %23, %3 \n\t" // y2
|
"lxvd2x 50, %23, %3 \n\t" // y2
|
||||||
"lxvd2x 51, %24, %3 \n\t" // y3
|
"lxvd2x 51, %24, %3 \n\t" // y3
|
||||||
|
|
||||||
"xxswapd %x8, 40 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x8,40) // exchange real and imag part
|
||||||
"xxswapd %x9, 41 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x9,41) // exchange real and imag part
|
||||||
"xxswapd %x10, 42 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x10,42) // exchange real and imag part
|
||||||
"xxswapd %x11, 43 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x11,43) // exchange real and imag part
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
"lxvd2x %x6, %23, %3 \n\t" // y6
|
"lxvd2x %x6, %23, %3 \n\t" // y6
|
||||||
"lxvd2x %x7, %24, %3 \n\t" // y7
|
"lxvd2x %x7, %24, %3 \n\t" // y7
|
||||||
|
|
||||||
"xxswapd %x12, 44 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x12,44) // exchange real and imag part
|
||||||
"xxswapd %x13, 45 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x13,45) // exchange real and imag part
|
||||||
"xxswapd %x14, 46 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x14,46) // exchange real and imag part
|
||||||
"xxswapd %x15, 47 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x15,47) // exchange real and imag part
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||||
"xvmaddadp 49, 41, 32 \n\t"
|
"xvmaddadp 49, 41, 32 \n\t"
|
||||||
|
@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
|
||||||
|
|
||||||
"addi %16, %16, 64 \n\t"
|
"addi %16, %16, 64 \n\t"
|
||||||
|
|
||||||
"xxswapd %x8, 40 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x8,40) // exchange real and imag part
|
||||||
"xxswapd %x9, 41 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x9,41) // exchange real and imag part
|
||||||
"lxvd2x 48, 0, %3 \n\t" // y0
|
"lxvd2x 48, 0, %3 \n\t" // y0
|
||||||
"lxvd2x 49, %22, %3 \n\t" // y1
|
"lxvd2x 49, %22, %3 \n\t" // y1
|
||||||
"xxswapd %x10, 42 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x10,42) // exchange real and imag part
|
||||||
"xxswapd %x11, 43 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x11,43) // exchange real and imag part
|
||||||
"lxvd2x 50, %23, %3 \n\t" // y2
|
"lxvd2x 50, %23, %3 \n\t" // y2
|
||||||
"lxvd2x 51, %24, %3 \n\t" // y3
|
"lxvd2x 51, %24, %3 \n\t" // y3
|
||||||
|
|
||||||
"xxswapd %x12, 44 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x12,44) // exchange real and imag part
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
"xxswapd %x13, 45 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x13,45) // exchange real and imag part
|
||||||
"lxvd2x %x4, 0, %3 \n\t" // y4
|
"lxvd2x %x4, 0, %3 \n\t" // y4
|
||||||
"lxvd2x %x5, %22, %3 \n\t" // y5
|
"lxvd2x %x5, %22, %3 \n\t" // y5
|
||||||
"xxswapd %x14, 46 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x14,46) // exchange real and imag part
|
||||||
"xxswapd %x15, 47 \n\t" // exchange real and imag part
|
XXSWAPD_S(%x15,47) // exchange real and imag part
|
||||||
"lxvd2x %x6, %23, %3 \n\t" // y6
|
"lxvd2x %x6, %23, %3 \n\t" // y6
|
||||||
"lxvd2x %x7, %24, %3 \n\t" // y7
|
"lxvd2x %x7, %24, %3 \n\t" // y7
|
||||||
|
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
|
||||||
"xvmaddadp 49, 41, 32 \n\t"
|
"xvmaddadp 49, 41, 32 \n\t"
|
||||||
|
|
|
@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -16 \n\t"
|
"addic. %1, %1, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"stxvd2x 32, 0, %3 \n\t"
|
"stxvd2x 32, 0, %3 \n\t"
|
||||||
"stxvd2x 33, %5, %3 \n\t"
|
"stxvd2x 33, %5, %3 \n\t"
|
||||||
|
|
|
@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
||||||
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
|
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
|
||||||
|
|
||||||
"xxswapd 0, 48 \n\t" // y0_i, y0_r
|
XXSWAPD_S(0,48) // y0_i, y0_r
|
||||||
"xxswapd 1, 49 \n\t" // y1_i, y1_r
|
XXSWAPD_S(1,49) // y1_i, y1_r
|
||||||
"xxswapd 2, 50 \n\t" // y2_i, y2_r
|
XXSWAPD_S(2,50) // y2_i, y2_r
|
||||||
"xxswapd 3, 51 \n\t" // y3_i, y3_r
|
XXSWAPD_S(3,51) // y3_i, y3_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
||||||
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
|
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
|
||||||
|
|
||||||
"xxswapd 8, 4 \n\t" // y0_i, y0_r
|
XXSWAPD_S(8,4) // y0_i, y0_r
|
||||||
"xxswapd 9, 5 \n\t" // y1_i, y1_r
|
XXSWAPD_S(9,5) // y1_i, y1_r
|
||||||
"xxswapd 10, 6 \n\t" // y2_i, y2_r
|
XXSWAPD_S(10,6) // y2_i, y2_r
|
||||||
"xxswapd 11, 7 \n\t" // y3_i, y3_r
|
XXSWAPD_S(11,7) // y3_i, y3_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||||
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
|
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
|
||||||
|
@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
|
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||||
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
|
||||||
|
|
||||||
"xxswapd 0,48 \n\t" // y0_i, y0_r
|
XXSWAPD_S(0,48) // y0_i, y0_r
|
||||||
"xxswapd 1,49 \n\t" // y1_i, y1_r
|
XXSWAPD_S(1,49) // y1_i, y1_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"xxswapd 2,50 \n\t" // y2_i, y2_r
|
XXSWAPD_S(2,50) // y2_i, y2_r
|
||||||
"xxswapd 3,51 \n\t" // y3_i, y3_r
|
XXSWAPD_S(3,51) // y3_i, y3_r
|
||||||
|
|
||||||
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
|
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||||
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
|
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
|
||||||
|
@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
|
||||||
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
|
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||||
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
|
||||||
|
|
||||||
"xxswapd 8,4 \n\t" // y0_i, y0_r
|
XXSWAPD_S(8,4) // y0_i, y0_r
|
||||||
"xxswapd 9,5 \n\t" // y1_i, y1_r
|
XXSWAPD_S(9,5) // y1_i, y1_r
|
||||||
|
|
||||||
"addi %2, %2, 64 \n\t"
|
"addi %2, %2, 64 \n\t"
|
||||||
"addi %3, %3, 64 \n\t"
|
"addi %3, %3, 64 \n\t"
|
||||||
|
|
||||||
"xxswapd 10,6 \n\t" // y2_i, y2_r
|
XXSWAPD_S(10,6) // y2_i, y2_r
|
||||||
"xxswapd 11,7 \n\t" // y3_i, y3_r
|
XXSWAPD_S(11,7) // y3_i, y3_r
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||||
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=4 and M=8
|
* Macros for N=4 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x8', `
|
||||||
|
#else
|
||||||
.macro COPY_4x8
|
.macro COPY_4x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs12, o32, T1
|
stxvd2x vs12, o32, T1
|
||||||
stxvd2x vs13, o48, T1
|
stxvd2x vs13, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=4
|
* Macros for N=4 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x4', `
|
||||||
|
#else
|
||||||
.macro COPY_4x4
|
.macro COPY_4x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=2
|
* Macros for N=4 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x2', `
|
||||||
|
#else
|
||||||
.macro COPY_4x2
|
.macro COPY_4x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=4 and M=1
|
* Macros for N=4 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_4x1', `
|
||||||
|
#else
|
||||||
.macro COPY_4x1
|
.macro COPY_4x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=8
|
* Macros for N=2 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x8', `
|
||||||
|
#else
|
||||||
.macro COPY_2x8
|
.macro COPY_2x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs46, o32, T1
|
stxvd2x vs46, o32, T1
|
||||||
stxvd2x vs47, o48, T1
|
stxvd2x vs47, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=4
|
* Macros for N=2 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x4', `
|
||||||
|
#else
|
||||||
.macro COPY_2x4
|
.macro COPY_2x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=2
|
* Macros for N=2 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x2', `
|
||||||
|
#else
|
||||||
.macro COPY_2x2
|
.macro COPY_2x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=2 and M=1
|
* Macros for N=2 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_2x1', `
|
||||||
|
#else
|
||||||
.macro COPY_2x1
|
.macro COPY_2x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=8
|
* Macros for N=1 and M=8
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x8', `
|
||||||
|
#else
|
||||||
.macro COPY_1x8
|
.macro COPY_1x8
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs38, o32, T1
|
stxvd2x vs38, o32, T1
|
||||||
stxvd2x vs39, o48, T1
|
stxvd2x vs39, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=4
|
* Macros for N=1 and M=4
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x4', `
|
||||||
|
#else
|
||||||
.macro COPY_1x4
|
.macro COPY_1x4
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T1
|
stxvd2x vs34, o32, T1
|
||||||
stxvd2x vs35, o48, T1
|
stxvd2x vs35, o48, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=2
|
* Macros for N=1 and M=2
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x2', `
|
||||||
|
#else
|
||||||
.macro COPY_1x2
|
.macro COPY_1x2
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
lxvd2x vs33, o16, A0
|
lxvd2x vs33, o16, A0
|
||||||
|
@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
stxvd2x vs33, o16, T1
|
stxvd2x vs33, o16, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************************************
|
/**********************************************************************************************
|
||||||
* Macros for N=1 and M=1
|
* Macros for N=1 and M=1
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
define(`COPY_1x1', `
|
||||||
|
#else
|
||||||
.macro COPY_1x1
|
.macro COPY_1x1
|
||||||
|
#endif
|
||||||
|
|
||||||
lxvd2x vs32, o0, A0
|
lxvd2x vs32, o0, A0
|
||||||
addi A0, A0, 16
|
addi A0, A0, 16
|
||||||
|
@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
stxvd2x vs32, o0, T1
|
stxvd2x vs32, o0, T1
|
||||||
|
|
||||||
|
#if defined(_AIX)
|
||||||
|
')
|
||||||
|
#else
|
||||||
.endm
|
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
|
||||||
|
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
"xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords
|
XXSPLTD_S(36,%x[cos],0) // load c to both dwords
|
||||||
"xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords
|
XXSPLTD_S(37,%x[sin],0) // load s to both dwords
|
||||||
|
|
||||||
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
|
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
|
||||||
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
|
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
|
||||||
|
@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
|
||||||
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
"addi %[y_ptr], %[y_ptr], 64 \n\t"
|
||||||
|
|
||||||
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
|
||||||
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
"addi %[y_ptr], %[y_ptr], 128 \n\t"
|
||||||
|
|
||||||
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
"addic. %[temp_n], %[temp_n], -4 \n\t"
|
||||||
"bgt+ 1b \n"
|
"bgt+ one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||||
"xvmuldp 41, 33, 36 \n\t"
|
"xvmuldp 41, 33, 36 \n\t"
|
||||||
|
|
|
@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"dcbt 0, %2 \n\t"
|
"dcbt 0, %2 \n\t"
|
||||||
|
|
||||||
"xsnegdp 33, %x16 \n\t" // -alpha_i
|
"xsnegdp 33, %x16 \n\t" // -alpha_i
|
||||||
"xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r
|
XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r
|
||||||
"xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i
|
XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i
|
||||||
|
|
||||||
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
|
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
|
||||||
"lxvd2x 41, %17, %2 \n\t"
|
"lxvd2x 41, %17, %2 \n\t"
|
||||||
|
@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"addi %2, %2, 128 \n\t"
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"ble 2f \n\t"
|
"ble two%= \n\t"
|
||||||
|
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||||
"xvmuldp 49, 41, 32 \n\t"
|
"xvmuldp 49, 41, 32 \n\t"
|
||||||
|
@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"xvmuldp %x5, 46, 32 \n\t"
|
"xvmuldp %x5, 46, 32 \n\t"
|
||||||
"xvmuldp %x6, 47, 32 \n\t"
|
"xvmuldp %x6, 47, 32 \n\t"
|
||||||
|
|
||||||
"xxswapd %x7, 40 \n\t"
|
XXSWAPD_S(%x7,40)
|
||||||
"xxswapd %x8, 41 \n\t"
|
XXSWAPD_S(%x8,41)
|
||||||
"xxswapd %x9, 42 \n\t"
|
XXSWAPD_S(%x9,42)
|
||||||
"xxswapd %x10, 43 \n\t"
|
XXSWAPD_S(%x10,43)
|
||||||
"xxswapd %x11, 44 \n\t"
|
XXSWAPD_S(%x11,44)
|
||||||
"xxswapd %x12, 45 \n\t"
|
XXSWAPD_S(%x12,45)
|
||||||
"xxswapd %x13, 46 \n\t"
|
XXSWAPD_S(%x13,46)
|
||||||
"xxswapd %x14, 47 \n\t"
|
XXSWAPD_S(%x14,47)
|
||||||
|
|
||||||
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||||
"xvmuldp %x8, %x8, 33 \n\t"
|
"xvmuldp %x8, %x8, 33 \n\t"
|
||||||
|
@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"addi %2, %2, 256 \n\t"
|
"addi %2, %2, 256 \n\t"
|
||||||
|
|
||||||
"addic. %1, %1, -8 \n\t"
|
"addic. %1, %1, -8 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"2: \n\t"
|
"two%=: \n\t"
|
||||||
|
|
||||||
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||||
"xvmuldp 49, 41, 32 \n\t"
|
"xvmuldp 49, 41, 32 \n\t"
|
||||||
|
@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||||
"xvmuldp %x5, 46, 32 \n\t"
|
"xvmuldp %x5, 46, 32 \n\t"
|
||||||
"xvmuldp %x6, 47, 32 \n\t"
|
"xvmuldp %x6, 47, 32 \n\t"
|
||||||
|
|
||||||
"xxswapd %x7, 40 \n\t"
|
XXSWAPD_S(%x7,40)
|
||||||
"xxswapd %x8, 41 \n\t"
|
XXSWAPD_S(%x8,41)
|
||||||
"xxswapd %x9, 42 \n\t"
|
XXSWAPD_S(%x9,42)
|
||||||
"xxswapd %x10, 43 \n\t"
|
XXSWAPD_S(%x10,43)
|
||||||
"xxswapd %x11, 44 \n\t"
|
XXSWAPD_S(%x11,44)
|
||||||
"xxswapd %x12, 45 \n\t"
|
XXSWAPD_S(%x12,45)
|
||||||
"xxswapd %x13, 46 \n\t"
|
XXSWAPD_S(%x13,46)
|
||||||
"xxswapd %x14, 47 \n\t"
|
XXSWAPD_S(%x14,47)
|
||||||
|
|
||||||
"addi %2, %2, -128 \n\t"
|
"addi %2, %2, -128 \n\t"
|
||||||
|
|
||||||
|
|
|
@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y)
|
||||||
{
|
{
|
||||||
__asm__
|
__asm__
|
||||||
(
|
(
|
||||||
".p2align 5 \n"
|
".align 5 \n"
|
||||||
"1: \n\t"
|
"one%=: \n\t"
|
||||||
"lxvd2x 32, 0, %4 \n\t"
|
"lxvd2x 32, 0, %4 \n\t"
|
||||||
"lxvd2x 33, %5, %4 \n\t"
|
"lxvd2x 33, %5, %4 \n\t"
|
||||||
"lxvd2x 34, %6, %4 \n\t"
|
"lxvd2x 34, %6, %4 \n\t"
|
||||||
|
@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y)
|
||||||
|
|
||||||
"addi %4, %4, 128 \n\t"
|
"addi %4, %4, 128 \n\t"
|
||||||
"addic. %2, %2, -16 \n\t"
|
"addic. %2, %2, -16 \n\t"
|
||||||
"bgt 1b \n"
|
"bgt one%= \n"
|
||||||
|
|
||||||
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||||
:
|
:
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
|
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
|
||||||
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
|
CGEMMKERNEL = cgemm_kernel_8x2_haswell.c
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
@ -67,7 +67,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
|
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
|
||||||
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
|
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
@ -97,6 +97,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
|
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
|
||||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
|
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
|
||||||
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
|
CGEMMKERNEL = cgemm_kernel_8x2_haswell.c
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
|
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
|
||||||
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
|
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
@ -94,6 +94,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
|
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
|
||||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,279 @@
|
||||||
|
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
|
||||||
|
/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
//recommended settings: GEMM_P = 320, GEMM_Q = 320.
|
||||||
|
|
||||||
|
/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
|
||||||
|
#define KERNEL_k1m8n1 \
|
||||||
|
"vmovups (%0),%%ymm1; addq $32,%0;"\
|
||||||
|
"vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
|
||||||
|
"addq $4,%1;"
|
||||||
|
#define KERNEL_h_k1m8n2 \
|
||||||
|
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
|
||||||
|
"vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
|
||||||
|
#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;"
|
||||||
|
#define KERNEL_h_k1m8n4 \
|
||||||
|
KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
|
||||||
|
#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
|
||||||
|
#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \
|
||||||
|
"vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
|
||||||
|
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
|
||||||
|
#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1)
|
||||||
|
#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;"
|
||||||
|
#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;"
|
||||||
|
#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
|
||||||
|
#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
|
||||||
|
#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
|
||||||
|
#define unit_init_m8n4(c1,c2,c3,c4) \
|
||||||
|
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
|
||||||
|
#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
|
||||||
|
#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
|
||||||
|
#define SAVE_m8n1 \
|
||||||
|
"vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\
|
||||||
|
"vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\
|
||||||
|
"vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);"
|
||||||
|
#define unit_save_m8n2(c1,c2) \
|
||||||
|
"vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\
|
||||||
|
"vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\
|
||||||
|
"vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\
|
||||||
|
"vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\
|
||||||
|
"vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\
|
||||||
|
"vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\
|
||||||
|
"leaq (%5,%3,2),%5;"
|
||||||
|
#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
|
||||||
|
#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7)
|
||||||
|
#define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
|
||||||
|
#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
|
||||||
|
#define COMPUTE_m8(ndim) \
|
||||||
|
INIT_m8n##ndim\
|
||||||
|
"movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\
|
||||||
|
"cmpq $24,%4; jb "#ndim"882f;"\
|
||||||
|
#ndim"881:\n\t"\
|
||||||
|
"cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
|
||||||
|
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
|
||||||
|
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
|
||||||
|
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
|
||||||
|
"prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\
|
||||||
|
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
|
||||||
|
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
|
||||||
|
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
|
||||||
|
"prefetcht1 (%8); addq $16,%8;"\
|
||||||
|
"subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\
|
||||||
|
"movq %2,%5;"\
|
||||||
|
#ndim"882:\n\t"\
|
||||||
|
"testq %4,%4; jz "#ndim"883f;"\
|
||||||
|
"prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\
|
||||||
|
KERNEL_k1m8n##ndim\
|
||||||
|
"decq %4; jmp "#ndim"882b;"\
|
||||||
|
#ndim"883:\n\t"\
|
||||||
|
"prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
|
||||||
|
SAVE_m8n##ndim "addq $64,%2;"
|
||||||
|
|
||||||
|
/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
|
||||||
|
#define KERNEL_k1m4n1 \
|
||||||
|
"vmovups (%0),%%xmm1; addq $16,%0;"\
|
||||||
|
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
|
||||||
|
"addq $4,%1;"
|
||||||
|
#define KERNEL_h_k1m4n2 \
|
||||||
|
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
|
||||||
|
"vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
|
||||||
|
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;"
|
||||||
|
#define KERNEL_h_k1m4n4 \
|
||||||
|
KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
|
||||||
|
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
|
||||||
|
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
|
||||||
|
"vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
|
||||||
|
"vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
|
||||||
|
#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1)
|
||||||
|
#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
|
||||||
|
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
|
||||||
|
#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
|
||||||
|
#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
|
||||||
|
#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
|
||||||
|
#define unit_init_m4n4(c1,c2,c3,c4) \
|
||||||
|
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
|
||||||
|
#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
|
||||||
|
#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
|
||||||
|
#define SAVE_m4n1 \
|
||||||
|
"vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\
|
||||||
|
"vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);"
|
||||||
|
#define unit_save_m4n2(c1,c2) \
|
||||||
|
"vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\
|
||||||
|
"vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\
|
||||||
|
"vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\
|
||||||
|
"vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\
|
||||||
|
"vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\
|
||||||
|
"leaq (%5,%3,2),%5;"
|
||||||
|
#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
|
||||||
|
#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7)
|
||||||
|
#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
|
||||||
|
#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
|
||||||
|
#define COMPUTE_m4(ndim) \
|
||||||
|
INIT_m4n##ndim\
|
||||||
|
"movq %%r13,%4; movq %%r14,%1;"\
|
||||||
|
#ndim"442:\n\t"\
|
||||||
|
"testq %4,%4; jz "#ndim"443f;"\
|
||||||
|
KERNEL_k1m4n##ndim\
|
||||||
|
"decq %4; jmp "#ndim"442b;"\
|
||||||
|
#ndim"443:\n\t"\
|
||||||
|
SAVE_m4n##ndim "addq $32,%2;"
|
||||||
|
|
||||||
|
/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
|
||||||
|
#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
|
||||||
|
#define KERNEL_k1m2n1 \
|
||||||
|
"vmovsd (%0),%%xmm1; addq $8,%0;"\
|
||||||
|
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
|
||||||
|
"addq $4,%1;"
|
||||||
|
#define SAVE_m2n1 \
|
||||||
|
"vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);"
|
||||||
|
#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
|
||||||
|
#define KERNEL_k1m2n2 \
|
||||||
|
"vmovsd (%0),%%xmm1; addq $8,%0;"\
|
||||||
|
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
|
||||||
|
"vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
|
||||||
|
"addq $8,%1;"
|
||||||
|
#define SAVE_m2n2 SAVE_m2n1 \
|
||||||
|
"vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);"
|
||||||
|
#define INIT_m2n4 INIT_m2n2
|
||||||
|
#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
|
||||||
|
#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
|
||||||
|
#define KERNEL_k1m2n4 \
|
||||||
|
"vmovups (%1),%%xmm3; addq $16,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
|
||||||
|
"vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
|
||||||
|
"addq $8,%0;"
|
||||||
|
#define KERNEL_k1m2n8 \
|
||||||
|
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
|
||||||
|
"vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
|
||||||
|
"addq $8,%0;"
|
||||||
|
#define KERNEL_k1m2n12 \
|
||||||
|
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
|
||||||
|
"vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
|
||||||
|
"addq $8,%0;"
|
||||||
|
#define unit_save_m2n4(c1,c2) \
|
||||||
|
"vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\
|
||||||
|
"vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\
|
||||||
|
"vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\
|
||||||
|
"leaq (%5,%3,2),%5;"\
|
||||||
|
"vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\
|
||||||
|
"vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\
|
||||||
|
"leaq (%5,%3,2),%5;"
|
||||||
|
#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
|
||||||
|
#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
|
||||||
|
#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)
|
||||||
|
#define COMPUTE_m2(ndim) \
|
||||||
|
INIT_m2n##ndim\
|
||||||
|
"movq %%r13,%4; movq %%r14,%1;"\
|
||||||
|
#ndim"222:\n\t"\
|
||||||
|
"testq %4,%4; jz "#ndim"223f;"\
|
||||||
|
KERNEL_k1m2n##ndim\
|
||||||
|
"decq %4; jmp "#ndim"222b;"\
|
||||||
|
#ndim"223:\n\t"\
|
||||||
|
SAVE_m2n##ndim "addq $16,%2;"
|
||||||
|
|
||||||
|
/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
|
||||||
|
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
|
||||||
|
#define KERNEL_k1m1n1 \
|
||||||
|
"vmovss (%1),%%xmm3; addq $4,%1;"\
|
||||||
|
"vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
|
||||||
|
"addq $4,%0;"
|
||||||
|
#define SAVE_m1n1 \
|
||||||
|
"vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
|
||||||
|
#define INIT_m1n2 INIT_m1n1
|
||||||
|
#define KERNEL_k1m1n2 \
|
||||||
|
"vmovsd (%1),%%xmm3; addq $8,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
|
||||||
|
"addq $4,%0;"
|
||||||
|
#define SAVE_m1n2 \
|
||||||
|
"vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
|
||||||
|
"vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);"
|
||||||
|
#define INIT_m1n4 INIT_m1n2
|
||||||
|
#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
|
||||||
|
#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
|
||||||
|
#define KERNEL_k1m1n4 \
|
||||||
|
"vmovups (%1),%%xmm3; addq $16,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
|
||||||
|
"addq $4,%0;"
|
||||||
|
#define KERNEL_k1m1n8 \
|
||||||
|
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
|
||||||
|
"addq $4,%0;"
|
||||||
|
#define KERNEL_k1m1n12 \
|
||||||
|
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\
|
||||||
|
"vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
|
||||||
|
"addq $4,%0;"
|
||||||
|
#define unit_save_m1n4(c1) \
|
||||||
|
"vunpcklps "#c1","#c1",%%xmm1; vunpckhps "#c1","#c1",%%xmm2;"\
|
||||||
|
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
|
||||||
|
"vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\
|
||||||
|
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
|
||||||
|
"vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
|
||||||
|
#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
|
||||||
|
#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5)
|
||||||
|
#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6)
|
||||||
|
#define COMPUTE_m1(ndim) \
|
||||||
|
INIT_m1n##ndim\
|
||||||
|
"movq %%r13,%4; movq %%r14,%1;"\
|
||||||
|
#ndim"112:\n\t"\
|
||||||
|
"testq %4,%4; jz "#ndim"113f;"\
|
||||||
|
KERNEL_k1m1n##ndim\
|
||||||
|
"decq %4; jmp "#ndim"112b;"\
|
||||||
|
#ndim"113:\n\t"\
|
||||||
|
SAVE_m1n##ndim "addq $8,%2;"
|
||||||
|
|
||||||
|
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
|
||||||
|
/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
|
||||||
|
/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */
|
||||||
|
|
||||||
|
#define COMPUTE(ndim) {\
|
||||||
|
next_b = b_pointer + ndim * K;\
|
||||||
|
__asm__ __volatile__(\
|
||||||
|
"vbroadcastsd (%6),%%ymm0;"\
|
||||||
|
"movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
|
||||||
|
"cmpq $8,%7;jb 33101"#ndim"f;"\
|
||||||
|
"33109"#ndim":\n\t"\
|
||||||
|
COMPUTE_m8(ndim)\
|
||||||
|
"subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\
|
||||||
|
"33101"#ndim":\n\t"\
|
||||||
|
"cmpq $4,%7;jb 33103"#ndim"f;"\
|
||||||
|
COMPUTE_m4(ndim)\
|
||||||
|
"subq $4,%7;"\
|
||||||
|
"33103"#ndim":\n\t"\
|
||||||
|
"cmpq $2,%7;jb 33104"#ndim"f;"\
|
||||||
|
COMPUTE_m2(ndim)\
|
||||||
|
"subq $2,%7;"\
|
||||||
|
"33104"#ndim":\n\t"\
|
||||||
|
"testq %7,%7;jz 33105"#ndim"f;"\
|
||||||
|
COMPUTE_m1(ndim)\
|
||||||
|
"33105"#ndim":\n\t"\
|
||||||
|
"movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
|
||||||
|
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
|
||||||
|
::"r11","r12","r13","r14","r15",\
|
||||||
|
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
|
||||||
|
a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
|
||||||
|
}
|
||||||
|
|
||||||
|
int __attribute__ ((noinline))
|
||||||
|
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
|
||||||
|
{
|
||||||
|
if(m==0||n==0||k==0) return 0;
|
||||||
|
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
|
||||||
|
float constval[2]; constval[0] = alphar; constval[1] = alphai;
|
||||||
|
float *const_val=constval;
|
||||||
|
int64_t M = (int64_t)m, K = (int64_t)k;
|
||||||
|
BLASLONG n_count = n;
|
||||||
|
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
|
||||||
|
for(;n_count>11;n_count-=12) COMPUTE(12)
|
||||||
|
for(;n_count>7;n_count-=8) COMPUTE(8)
|
||||||
|
for(;n_count>3;n_count-=4) COMPUTE(4)
|
||||||
|
for(;n_count>1;n_count-=2) COMPUTE(2)
|
||||||
|
if(n_count>0) COMPUTE(1)
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,292 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */
|
||||||
|
|
||||||
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
#define A_CONJ 0
|
||||||
|
#define B_CONJ 0
|
||||||
|
#endif
|
||||||
|
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||||
|
#define A_CONJ 1
|
||||||
|
#define B_CONJ 0
|
||||||
|
#endif
|
||||||
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||||
|
#define A_CONJ 0
|
||||||
|
#define B_CONJ 1
|
||||||
|
#endif
|
||||||
|
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||||
|
#define A_CONJ 1
|
||||||
|
#define B_CONJ 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
|
||||||
|
/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */
|
||||||
|
|
||||||
|
/* m=8, ymm 0-3 temp, ymm 4-15 acc */
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
|
||||||
|
#define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
|
||||||
|
#else
|
||||||
|
#define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
|
||||||
|
#define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
|
||||||
|
#endif
|
||||||
|
/* expanded accumulators for m8n1 and m8n2 */
|
||||||
|
#define KERNEL_k1m8n1 \
|
||||||
|
"vbroadcastsd (%1),%%ymm0; addq $8,%1;"\
|
||||||
|
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\
|
||||||
|
"vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\
|
||||||
|
"addq $64,%0;"
|
||||||
|
#define KERNEL_k1m8n2 \
|
||||||
|
"vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\
|
||||||
|
"vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\
|
||||||
|
"vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\
|
||||||
|
"addq $64,%0;"
|
||||||
|
/* contracted accumulators for m8n4 and m8n6 */
|
||||||
|
#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
|
||||||
|
"vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\
|
||||||
|
"vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc)
|
||||||
|
#define KERNEL_1_k1m8n4 \
|
||||||
|
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
|
||||||
|
acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
|
||||||
|
#define KERNEL_2_k1m8n4 \
|
||||||
|
"vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
|
||||||
|
acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
|
||||||
|
#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
|
||||||
|
#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;"
|
||||||
|
#define zero_4ymm(no1,no2,no3,no4) \
|
||||||
|
"vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
|
||||||
|
"vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
|
||||||
|
/* initialization and storage macros */
|
||||||
|
#define INIT_m8n1 zero_4ymm(4,5,6,7)
|
||||||
|
#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
|
||||||
|
#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
|
||||||
|
#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15)
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
|
||||||
|
#else
|
||||||
|
#define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
|
||||||
|
#endif
|
||||||
|
#if A_CONJ == 0
|
||||||
|
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
|
||||||
|
"vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
|
||||||
|
"vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");"
|
||||||
|
#else
|
||||||
|
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
|
||||||
|
"vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
|
||||||
|
"vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");"
|
||||||
|
#endif
|
||||||
|
#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
|
||||||
|
#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
|
||||||
|
#define SAVE_m8n2 SAVE_m8n1\
|
||||||
|
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m8n4 save_init_m8\
|
||||||
|
save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
|
||||||
|
save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\
|
||||||
|
save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
|
||||||
|
#define COMPUTE_m8(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\
|
||||||
|
"movq $10,%5; movq $84,%%r15;"\
|
||||||
|
#ndim"8881:\n\t"\
|
||||||
|
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
|
||||||
|
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
|
||||||
|
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
|
||||||
|
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
|
||||||
|
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
|
||||||
|
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\
|
||||||
|
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\
|
||||||
|
#ndim"8882:\n\t"\
|
||||||
|
"prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
|
||||||
|
KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
|
||||||
|
#ndim"8883:\n\t"\
|
||||||
|
"prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
|
||||||
|
|
||||||
|
/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
|
||||||
|
#define KERNEL_k1m4n1 \
|
||||||
|
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
|
||||||
|
"vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;"
|
||||||
|
#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \
|
||||||
|
"vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\
|
||||||
|
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r)
|
||||||
|
#define KERNEL_h_k1m4n2 \
|
||||||
|
"vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1)
|
||||||
|
#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1)
|
||||||
|
#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;"
|
||||||
|
#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
|
||||||
|
#define INIT_m4n2 zero_4ymm(4,5,6,7)
|
||||||
|
#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11)
|
||||||
|
#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
|
||||||
|
#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
|
||||||
|
#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
|
||||||
|
#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\
|
||||||
|
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
|
||||||
|
cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
|
||||||
|
#define COMPUTE_m4(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"4442f;"\
|
||||||
|
#ndim"4441:\n\t"\
|
||||||
|
KERNEL_k1m4n##ndim\
|
||||||
|
"decq %5; jnz "#ndim"4441b;"\
|
||||||
|
#ndim"4442:\n\t"\
|
||||||
|
SAVE_m4n##ndim
|
||||||
|
|
||||||
|
/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
||||||
|
#else
|
||||||
|
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
||||||
|
#endif
|
||||||
|
#define KERNEL_h_k1m2n1 \
|
||||||
|
"vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\
|
||||||
|
"vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5)
|
||||||
|
#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\
|
||||||
|
"vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7)
|
||||||
|
#define acc_m2n2_exp(c1,c2,c3,c4,...)\
|
||||||
|
"vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\
|
||||||
|
"vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4)
|
||||||
|
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
|
||||||
|
#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
|
||||||
|
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;"
|
||||||
|
#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";"
|
||||||
|
#define INIT_m2n1 zero_2xmm(4,5)
|
||||||
|
#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7)
|
||||||
|
#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11)
|
||||||
|
#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15)
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
|
||||||
|
#else
|
||||||
|
#define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
|
||||||
|
#endif
|
||||||
|
#if A_CONJ == 0
|
||||||
|
#define save_1xmm(c,tmp,alpr,alpi) \
|
||||||
|
"vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\
|
||||||
|
"vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;"
|
||||||
|
#else
|
||||||
|
#define save_1xmm(c,tmp,alpr,alpi) \
|
||||||
|
"vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\
|
||||||
|
"vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;"
|
||||||
|
#endif
|
||||||
|
#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
|
||||||
|
#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1)
|
||||||
|
#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1)
|
||||||
|
#define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1)
|
||||||
|
#define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1)
|
||||||
|
#define COMPUTE_m2(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"2222f;"\
|
||||||
|
#ndim"2221:\n\t"\
|
||||||
|
KERNEL_k1m2n##ndim\
|
||||||
|
"decq %5; jnz "#ndim"2221b;"\
|
||||||
|
#ndim"2222:\n\t"\
|
||||||
|
SAVE_m2n##ndim
|
||||||
|
|
||||||
|
/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
||||||
|
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
|
||||||
|
#else
|
||||||
|
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
||||||
|
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
|
||||||
|
#endif
|
||||||
|
#define KERNEL_k1m1n1 \
|
||||||
|
"vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
|
||||||
|
"vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5)
|
||||||
|
#define KERNEL_h_k1m1n2 \
|
||||||
|
"vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
|
||||||
|
"vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5)
|
||||||
|
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7)
|
||||||
|
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9)
|
||||||
|
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
|
||||||
|
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;"
|
||||||
|
#define INIT_m1n1 zero_2xmm(4,5)
|
||||||
|
#define INIT_m1n2 zero_2xmm(4,5)
|
||||||
|
#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7)
|
||||||
|
#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9)
|
||||||
|
#if A_CONJ == 0
|
||||||
|
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
|
||||||
|
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\
|
||||||
|
"vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);"
|
||||||
|
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
|
||||||
|
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
|
||||||
|
"vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\
|
||||||
|
"vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
#else
|
||||||
|
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
|
||||||
|
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\
|
||||||
|
"vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);"
|
||||||
|
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
|
||||||
|
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
|
||||||
|
"vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\
|
||||||
|
"vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
#endif
|
||||||
|
#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
|
||||||
|
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1)
|
||||||
|
#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1)
|
||||||
|
#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1)
|
||||||
|
#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1)
|
||||||
|
#define COMPUTE_m1(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"1112f;"\
|
||||||
|
#ndim"1111:\n\t"\
|
||||||
|
KERNEL_k1m1n##ndim\
|
||||||
|
"decq %5; jnz "#ndim"1111b;"\
|
||||||
|
#ndim"1112:\n\t"\
|
||||||
|
SAVE_m1n##ndim
|
||||||
|
|
||||||
|
#define COMPUTE(ndim) {\
|
||||||
|
b_pref = b_ptr + ndim * K *2;\
|
||||||
|
__asm__ __volatile__ (\
|
||||||
|
"movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\
|
||||||
|
"cmpq $8,%7; jb "#ndim"9992f;"\
|
||||||
|
#ndim"9991:\n\t"\
|
||||||
|
COMPUTE_m8(ndim)\
|
||||||
|
"subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\
|
||||||
|
#ndim"9992:\n\t"\
|
||||||
|
"cmpq $4,%7; jb "#ndim"9993f;"\
|
||||||
|
COMPUTE_m4(ndim) "subq $4,%7;"\
|
||||||
|
#ndim"9993:\n\t"\
|
||||||
|
"cmpq $2,%7; jb "#ndim"9994f;"\
|
||||||
|
COMPUTE_m2(ndim) "subq $2,%7;"\
|
||||||
|
#ndim"9994:\n\t"\
|
||||||
|
"testq %7,%7; jz "#ndim"9995f;"\
|
||||||
|
COMPUTE_m1(ndim)\
|
||||||
|
#ndim"9995:\n\t"\
|
||||||
|
"movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
|
||||||
|
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
|
||||||
|
::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
|
||||||
|
"xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
|
||||||
|
a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
|
||||||
|
}
|
||||||
|
|
||||||
|
int __attribute__ ((noinline))
|
||||||
|
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
|
||||||
|
{
|
||||||
|
if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
|
||||||
|
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
float const_val[2] = {-alphar, -alphai};
|
||||||
|
#else
|
||||||
|
float const_val[2] = {alphar, alphai};
|
||||||
|
#endif
|
||||||
|
int64_t M = (int64_t)m, K = (int64_t)k;
|
||||||
|
BLASLONG n_count = n;
|
||||||
|
float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
|
||||||
|
for(;n_count>5;n_count-=6) COMPUTE(6)
|
||||||
|
for(;n_count>3;n_count-=4) COMPUTE(4)
|
||||||
|
for(;n_count>1;n_count-=2) COMPUTE(2)
|
||||||
|
if(n_count>0) COMPUTE(1)
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,240 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */
|
||||||
|
|
||||||
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||||
|
#define A_CONJ 0
|
||||||
|
#define B_CONJ 0
|
||||||
|
#endif
|
||||||
|
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||||
|
#define A_CONJ 1
|
||||||
|
#define B_CONJ 0
|
||||||
|
#endif
|
||||||
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||||
|
#define A_CONJ 0
|
||||||
|
#define B_CONJ 1
|
||||||
|
#endif
|
||||||
|
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||||
|
#define A_CONJ 1
|
||||||
|
#define B_CONJ 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
|
||||||
|
/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */
|
||||||
|
|
||||||
|
/* m=4, ymm 0-3 temp, ymm 4-15 acc */
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
|
||||||
|
#define acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
|
||||||
|
#else
|
||||||
|
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
|
||||||
|
#define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
|
||||||
|
#endif
|
||||||
|
/* expanded accumulators for m4n1 and m4n2 */
|
||||||
|
#define KERNEL_k1m4n1 \
|
||||||
|
"vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\
|
||||||
|
"vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\
|
||||||
|
"vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\
|
||||||
|
"addq $64,%0;"
|
||||||
|
#define KERNEL_k1m4n2 \
|
||||||
|
"vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\
|
||||||
|
"vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\
|
||||||
|
"vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\
|
||||||
|
"addq $64,%0;"
|
||||||
|
/* contracted accumulators for m4n4 and m4n6 */
|
||||||
|
#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
|
||||||
|
"vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\
|
||||||
|
"vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc)
|
||||||
|
#define KERNEL_1_k1m4n4 \
|
||||||
|
"vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
|
||||||
|
acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
|
||||||
|
#define KERNEL_2_k1m4n4 \
|
||||||
|
"vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
|
||||||
|
acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
|
||||||
|
#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
|
||||||
|
#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;"
|
||||||
|
#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;"
|
||||||
|
#define zero_4ymm(no1,no2,no3,no4) \
|
||||||
|
"vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
|
||||||
|
"vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
|
||||||
|
/* initialization and storage macros */
|
||||||
|
#define INIT_m4n1 zero_4ymm(4,5,6,7)
|
||||||
|
#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
|
||||||
|
#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
|
||||||
|
#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
|
||||||
|
#else
|
||||||
|
#define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
|
||||||
|
#endif
|
||||||
|
#if A_CONJ == 0
|
||||||
|
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
|
||||||
|
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
|
||||||
|
"vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");"
|
||||||
|
#else
|
||||||
|
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
|
||||||
|
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
|
||||||
|
"vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");"
|
||||||
|
#endif
|
||||||
|
#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
|
||||||
|
#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
|
||||||
|
#define SAVE_m4n2 SAVE_m4n1\
|
||||||
|
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m4n4 save_init_m4\
|
||||||
|
save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
|
||||||
|
save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
|
||||||
|
save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
|
||||||
|
#define COMPUTE_m4(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\
|
||||||
|
"movq $10,%5; movq $84,%%r15;"\
|
||||||
|
#ndim"4441:\n\t"\
|
||||||
|
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
|
||||||
|
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
|
||||||
|
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
|
||||||
|
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
|
||||||
|
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
|
||||||
|
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
|
||||||
|
#ndim"4442:\n\t"\
|
||||||
|
"prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
|
||||||
|
KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\
|
||||||
|
#ndim"4443:\n\t"\
|
||||||
|
"prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim
|
||||||
|
|
||||||
|
/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
|
||||||
|
#define KERNEL_k1m2n1 \
|
||||||
|
"vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\
|
||||||
|
"vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;"
|
||||||
|
#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) \
|
||||||
|
"vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\
|
||||||
|
"vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r)
|
||||||
|
#define KERNEL_h_k1m2n2 \
|
||||||
|
"vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1)
|
||||||
|
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
|
||||||
|
#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
|
||||||
|
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;"
|
||||||
|
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
|
||||||
|
#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;"
|
||||||
|
#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
|
||||||
|
#define INIT_m2n2 zero_4ymm(4,5,6,7)
|
||||||
|
#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11)
|
||||||
|
#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15)
|
||||||
|
#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
|
||||||
|
#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
|
||||||
|
#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\
|
||||||
|
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
|
||||||
|
#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\
|
||||||
|
cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
|
||||||
|
#define COMPUTE_m2(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"2222f;"\
|
||||||
|
#ndim"2221:\n\t"\
|
||||||
|
KERNEL_k1m2n##ndim\
|
||||||
|
"decq %5; jnz "#ndim"2221b;"\
|
||||||
|
#ndim"2222:\n\t"\
|
||||||
|
SAVE_m2n##ndim
|
||||||
|
|
||||||
|
/* m=1, vmm 0-3 temp, vmm 4-15 acc, expanded accumulators */
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
||||||
|
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
|
||||||
|
#else
|
||||||
|
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
|
||||||
|
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
|
||||||
|
#endif
|
||||||
|
#define KERNEL_k1m1n1 \
|
||||||
|
"vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\
|
||||||
|
"vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5)
|
||||||
|
#define KERNEL_h_k1m1n2 \
|
||||||
|
"vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\
|
||||||
|
"vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5)
|
||||||
|
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7)
|
||||||
|
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9)
|
||||||
|
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;"
|
||||||
|
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
|
||||||
|
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;"
|
||||||
|
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;"
|
||||||
|
#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
|
||||||
|
#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;"
|
||||||
|
#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;"
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
#define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
|
||||||
|
#else
|
||||||
|
#define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
|
||||||
|
#endif
|
||||||
|
#if A_CONJ == 0
|
||||||
|
#define save_m1n1(c,tmp,alpr,alpi) \
|
||||||
|
"vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\
|
||||||
|
"vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);"
|
||||||
|
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
|
||||||
|
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
|
||||||
|
"vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\
|
||||||
|
"vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
#else
|
||||||
|
#define save_m1n1(c,tmp,alpr,alpi) \
|
||||||
|
"vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\
|
||||||
|
"vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);"
|
||||||
|
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
|
||||||
|
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
|
||||||
|
"vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\
|
||||||
|
"vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
#endif
|
||||||
|
#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
|
||||||
|
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1)
|
||||||
|
#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1)
|
||||||
|
#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1)
|
||||||
|
#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1)
|
||||||
|
#define COMPUTE_m1(ndim) \
|
||||||
|
"movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
|
||||||
|
"testq %5,%5; jz "#ndim"1112f;"\
|
||||||
|
#ndim"1111:\n\t"\
|
||||||
|
KERNEL_k1m1n##ndim\
|
||||||
|
"decq %5; jnz "#ndim"1111b;"\
|
||||||
|
#ndim"1112:\n\t"\
|
||||||
|
SAVE_m1n##ndim
|
||||||
|
|
||||||
|
#define COMPUTE(ndim) {\
|
||||||
|
b_pref = b_ptr + ndim * K *2;\
|
||||||
|
__asm__ __volatile__ (\
|
||||||
|
"movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\
|
||||||
|
"cmpq $4,%7; jb "#ndim"9992f;"\
|
||||||
|
#ndim"9991:\n\t"\
|
||||||
|
COMPUTE_m4(ndim)\
|
||||||
|
"subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\
|
||||||
|
#ndim"9992:\n\t"\
|
||||||
|
"cmpq $2,%7; jb "#ndim"9993f;"\
|
||||||
|
COMPUTE_m2(ndim) "subq $2,%7;"\
|
||||||
|
#ndim"9993:\n\t"\
|
||||||
|
"testq %7,%7; jz "#ndim"9994f;"\
|
||||||
|
COMPUTE_m1(ndim)\
|
||||||
|
#ndim"9994:\n\t"\
|
||||||
|
"movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
|
||||||
|
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
|
||||||
|
::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
|
||||||
|
"xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
|
||||||
|
a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
|
||||||
|
}
|
||||||
|
|
||||||
|
int __attribute__ ((noinline))
|
||||||
|
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
|
||||||
|
{
|
||||||
|
if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
|
||||||
|
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2;
|
||||||
|
#if A_CONJ == B_CONJ
|
||||||
|
double const_val[2] = {-alphar, -alphai};
|
||||||
|
#else
|
||||||
|
double const_val[2] = {alphar, alphai};
|
||||||
|
#endif
|
||||||
|
int64_t M = (int64_t)m, K = (int64_t)k;
|
||||||
|
BLASLONG n_count = n;
|
||||||
|
double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
|
||||||
|
for(;n_count>5;n_count-=6) COMPUTE(6)
|
||||||
|
for(;n_count>3;n_count-=4) COMPUTE(4)
|
||||||
|
for(;n_count>1;n_count-=2) COMPUTE(2)
|
||||||
|
if(n_count>0) COMPUTE(1)
|
||||||
|
return 0;
|
||||||
|
}
|
32
param.h
32
param.h
|
@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 768
|
#define SGEMM_DEFAULT_P 768
|
||||||
#define DGEMM_DEFAULT_P 512
|
#define DGEMM_DEFAULT_P 512
|
||||||
#define CGEMM_DEFAULT_P 384
|
#define CGEMM_DEFAULT_P 256
|
||||||
#define ZGEMM_DEFAULT_P 256
|
#define ZGEMM_DEFAULT_P 192
|
||||||
|
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
#define SGEMM_DEFAULT_Q 320
|
#define SGEMM_DEFAULT_Q 320
|
||||||
|
@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define SGEMM_DEFAULT_Q 384
|
#define SGEMM_DEFAULT_Q 384
|
||||||
#define DGEMM_DEFAULT_Q 256
|
#define DGEMM_DEFAULT_Q 256
|
||||||
#endif
|
#endif
|
||||||
#define CGEMM_DEFAULT_Q 192
|
#define CGEMM_DEFAULT_Q 256
|
||||||
#define ZGEMM_DEFAULT_Q 128
|
#define ZGEMM_DEFAULT_Q 192
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_R sgemm_r
|
#define SGEMM_DEFAULT_R sgemm_r
|
||||||
#define DGEMM_DEFAULT_R 13824
|
#define DGEMM_DEFAULT_R 13824
|
||||||
|
@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define XGEMM_DEFAULT_R xgemm_r
|
#define XGEMM_DEFAULT_R xgemm_r
|
||||||
#define XGEMM_DEFAULT_Q 128
|
#define XGEMM_DEFAULT_Q 128
|
||||||
|
|
||||||
#define CGEMM3M_DEFAULT_UNROLL_N 8
|
#define CGEMM3M_DEFAULT_UNROLL_N 4
|
||||||
#define CGEMM3M_DEFAULT_UNROLL_M 4
|
#define CGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
#define ZGEMM3M_DEFAULT_UNROLL_N 8
|
#define ZGEMM3M_DEFAULT_UNROLL_N 8
|
||||||
#define ZGEMM3M_DEFAULT_UNROLL_M 2
|
#define ZGEMM3M_DEFAULT_UNROLL_M 2
|
||||||
|
|
||||||
#define CGEMM3M_DEFAULT_P 448
|
#define CGEMM3M_DEFAULT_P 320
|
||||||
#define ZGEMM3M_DEFAULT_P 224
|
#define ZGEMM3M_DEFAULT_P 224
|
||||||
#define XGEMM3M_DEFAULT_P 112
|
#define XGEMM3M_DEFAULT_P 112
|
||||||
#define CGEMM3M_DEFAULT_Q 224
|
#define CGEMM3M_DEFAULT_Q 320
|
||||||
#define ZGEMM3M_DEFAULT_Q 224
|
#define ZGEMM3M_DEFAULT_Q 224
|
||||||
#define XGEMM3M_DEFAULT_Q 224
|
#define XGEMM3M_DEFAULT_Q 224
|
||||||
#define CGEMM3M_DEFAULT_R 12288
|
#define CGEMM3M_DEFAULT_R 12288
|
||||||
|
@ -1571,8 +1571,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 768
|
#define SGEMM_DEFAULT_P 768
|
||||||
#define DGEMM_DEFAULT_P 512
|
#define DGEMM_DEFAULT_P 512
|
||||||
#define CGEMM_DEFAULT_P 384
|
#define CGEMM_DEFAULT_P 256
|
||||||
#define ZGEMM_DEFAULT_P 256
|
#define ZGEMM_DEFAULT_P 192
|
||||||
|
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
#define SGEMM_DEFAULT_Q 320
|
#define SGEMM_DEFAULT_Q 320
|
||||||
|
@ -1581,8 +1581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define SGEMM_DEFAULT_Q 384
|
#define SGEMM_DEFAULT_Q 384
|
||||||
#define DGEMM_DEFAULT_Q 256
|
#define DGEMM_DEFAULT_Q 256
|
||||||
#endif
|
#endif
|
||||||
#define CGEMM_DEFAULT_Q 192
|
#define CGEMM_DEFAULT_Q 256
|
||||||
#define ZGEMM_DEFAULT_Q 128
|
#define ZGEMM_DEFAULT_Q 192
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_R sgemm_r
|
#define SGEMM_DEFAULT_R sgemm_r
|
||||||
#define DGEMM_DEFAULT_R 13824
|
#define DGEMM_DEFAULT_R 13824
|
||||||
|
@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define XGEMM_DEFAULT_R xgemm_r
|
#define XGEMM_DEFAULT_R xgemm_r
|
||||||
#define XGEMM_DEFAULT_Q 128
|
#define XGEMM_DEFAULT_Q 128
|
||||||
|
|
||||||
#define CGEMM3M_DEFAULT_UNROLL_N 8
|
#define CGEMM3M_DEFAULT_UNROLL_N 4
|
||||||
#define CGEMM3M_DEFAULT_UNROLL_M 4
|
#define CGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
#define ZGEMM3M_DEFAULT_UNROLL_N 8
|
#define ZGEMM3M_DEFAULT_UNROLL_N 8
|
||||||
#define ZGEMM3M_DEFAULT_UNROLL_M 2
|
#define ZGEMM3M_DEFAULT_UNROLL_M 2
|
||||||
|
|
||||||
#define CGEMM3M_DEFAULT_P 448
|
#define CGEMM3M_DEFAULT_P 320
|
||||||
#define ZGEMM3M_DEFAULT_P 224
|
#define ZGEMM3M_DEFAULT_P 224
|
||||||
#define XGEMM3M_DEFAULT_P 112
|
#define XGEMM3M_DEFAULT_P 112
|
||||||
#define CGEMM3M_DEFAULT_Q 224
|
#define CGEMM3M_DEFAULT_Q 320
|
||||||
#define ZGEMM3M_DEFAULT_Q 224
|
#define ZGEMM3M_DEFAULT_Q 224
|
||||||
#define XGEMM3M_DEFAULT_Q 224
|
#define XGEMM3M_DEFAULT_Q 224
|
||||||
#define CGEMM3M_DEFAULT_R 12288
|
#define CGEMM3M_DEFAULT_R 12288
|
||||||
|
|
Loading…
Reference in New Issue