commit
525db5401c
|
@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
|||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
# Compiler flags for CORE = THUNDERX3T110.
# The thunderx3t110 tuning is used only when GCCVERSIONGTEQ10 is 1; in every
# other case (including an unset GCCVERSIONGTEQ10) the ThunderX2 T99 flags
# are used as a fallback.
ifeq ($(CORE), THUNDERX3T110)
ifneq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
else
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
endif
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
|
|
|
@ -11,34 +11,34 @@ endif
|
|||
|
||||
ifeq ($(CORE), POWER10)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math
|
||||
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
|
||||
endif
|
||||
else
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
|
||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
|
||||
else
|
||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive
|
||||
endif
|
||||
|
@ -48,26 +48,26 @@ endif
|
|||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
|
||||
endif
|
||||
else
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
|
||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
|
||||
else
|
||||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(OSNAME), AIX)
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive
|
||||
|
|
|
@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX
|
|||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
DYNAMIC_CORE += THUNDERX3T110
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
|
@ -617,7 +618,6 @@ DYNAMIC_CORE += POWER8
|
|||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
DYNAMIC_CORE += POWER10
|
||||
override LDFLAGS += -Wl,-no-power10-stubs
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
|
@ -627,11 +627,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
|||
endif
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
DYNAMIC_CORE += POWER10
|
||||
override LDFLAGS += -Wl,-no-power10-stubs
|
||||
else ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
|
||||
DYNAMIC_CORE += POWER10
|
||||
override LDFLAGS += -Wl,-no-power10-stubs
|
||||
endif
|
||||
else
|
||||
# Note: no comma after "info". GNU make only recognizes a function call when
# the name is followed by whitespace; "$(info, ...)" is parsed as a reference
# to an undefined variable and silently prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
|
@ -1241,7 +1239,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
|||
|
||||
include $(TOPDIR)/Makefile.$(ARCH)
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
|
||||
endif
|
||||
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
||||
|
||||
ifeq ($(CORE), PPC440)
|
||||
|
|
|
@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
|||
## Installation from Source
|
||||
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
|
||||
sure to use the develop branch - master is several years out of date due to a change of maintainership.)
|
||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||
Most can also be given directly on the make or cmake command line.
|
||||
|
||||
|
|
|
@ -96,6 +96,7 @@ FALKOR
|
|||
THUNDERX
|
||||
THUNDERX2T99
|
||||
TSV110
|
||||
THUNDERX3T110
|
||||
|
||||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
|
|
|
@ -45,7 +45,7 @@ endif ()
|
|||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
|
|
|
@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
if ("${TCORE}" STREQUAL "CORTEXA57")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
else ()
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
endif ()
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
|
@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "THUNDERX3T110")
|
||||
# Cache/TLB parameters appended to the generated target config for THUNDERX3T110.
# L1_DATA_SIZE is 32768 so that it agrees with the other THUNDERX3T110
# definitions in this change (the get_cpuconfig() printf output and the
# FORCE_THUNDERX3T110 ARCHCONFIG string in getarch.c); it previously said 65536.
file(APPEND ${TARGET_CONF_TEMP}
  "#define THUNDERX3T110\n"
  "#define L1_CODE_SIZE\t65536\n"
  "#define L1_CODE_LINESIZE\t64\n"
  "#define L1_CODE_ASSOCIATIVE\t8\n"
  "#define L1_DATA_SIZE\t32768\n"
  "#define L1_DATA_LINESIZE\t64\n"
  "#define L1_DATA_ASSOCIATIVE\t8\n"
  "#define L2_SIZE\t524288\n"
  "#define L2_LINESIZE\t64\n"
  "#define L2_ASSOCIATIVE\t8\n"
  "#define L3_SIZE\t94371840\n"
  "#define L3_LINESIZE\t64\n"
  "#define L3_ASSOCIATIVE\t32\n"
  "#define DTB_DEFAULT_ENTRIES\t64\n"
  "#define DTB_SIZE\t4096\n"
  "#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
// Cavium
|
||||
#define CPU_THUNDERX 7
|
||||
#define CPU_THUNDERX2T99 8
|
||||
#define CPU_THUNDERX3T110 12
|
||||
//Hisilicon
|
||||
#define CPU_TSV110 9
|
||||
// Ampere
|
||||
|
@ -57,7 +58,8 @@ static char *cpuname[] = {
|
|||
"THUNDERX2T99",
|
||||
"TSV110",
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1"
|
||||
"NEOVERSEN1",
|
||||
"THUNDERX3T110"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
@ -72,7 +74,8 @@ static char *cpuname_lower[] = {
|
|||
"thunderx2t99",
|
||||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1"
|
||||
"neoversen1",
|
||||
"thunderx3t110"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -158,6 +161,8 @@ int detect(void)
|
|||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||
return CPU_THUNDERX2T99;
|
||||
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8"))
|
||||
return CPU_THUNDERX3T110;
|
||||
// HiSilicon
|
||||
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||
return CPU_TSV110;
|
||||
|
@ -372,7 +377,25 @@ void get_cpuconfig(void)
|
|||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX3T110:
|
||||
printf("#define THUNDERX3T110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 524288 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 94371840 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
|
21
cpuid_x86.c
21
cpuid_x86.c
|
@ -1454,10 +1454,11 @@ int get_cpuname(void){
|
|||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 3:
|
||||
case 7:
|
||||
case 10:
|
||||
// case 7:
|
||||
// case 10:
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 5:
|
||||
case 7:
|
||||
return CPUTYPE_BOBCAT;
|
||||
case 6:
|
||||
switch (model) {
|
||||
|
@ -1507,6 +1508,8 @@ int get_cpuname(void){
|
|||
// AMD Ryzen
|
||||
case 8:
|
||||
// AMD Ryzen2
|
||||
default:
|
||||
// Matisse/Renoir and other recent Ryzen2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
|
@ -1516,6 +1519,16 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
case 10: // Zen3
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -2135,12 +2148,14 @@ int get_coretype(void){
|
|||
}
|
||||
break;
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
} else if (exfamily == 8 || exfamily == 10) {
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// Ryzen 2
|
||||
default:
|
||||
// Matisse,Renoir Ryzen2 models
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
|
|
|
@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){
|
|||
if ((exfamily == 0) || (exfamily == 2)) {
|
||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
||||
else return &gotoblas_OPTERON;
|
||||
} else if (exfamily == 5) {
|
||||
} else if (exfamily == 5 || exfamily == 7) {
|
||||
return &gotoblas_BOBCAT;
|
||||
} else if (exfamily == 6) {
|
||||
if(model == 1){
|
||||
|
@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1 || model == 8) {
|
||||
/* if (model == 1 || model == 8) */ {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
|
@ -725,9 +725,17 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
} else if (exfamily == 10) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
|
|||
extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 11
|
||||
#define NUM_CORETYPES 12
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -82,6 +83,7 @@ static char *corename[] = {
|
|||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
|
@ -97,6 +99,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 8: return (&gotoblas_TSV110);
|
||||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_THUNDERX3T110);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_THUNDERX;
|
||||
case 0x0af: // ThunderX2
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
case 0x0b8: // ThunderX3
|
||||
return &gotoblas_THUNDERX3T110;
|
||||
}
|
||||
break;
|
||||
case 0x48: // HiSilicon
|
||||
|
|
18
getarch.c
18
getarch.c
|
@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "EMAG8180"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX3T110
|
||||
#define ARMV8
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "THUNDERX3T110"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DTHUNDERX3T110 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx3t110"
|
||||
#define CORENAME "THUNDERX3T110"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
|
|
|
@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC)
|
|||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
endif
|
||||
|
||||
# When building for ARCH = power with C_COMPILER = CLANG, force
# -fno-integrated-as into CFLAGS (override: applies even if CFLAGS was given
# on the make command line). Both conditions are tested in one comparison.
ifeq ($(ARCH)/$(C_COMPILER), power/CLANG)
override CFLAGS += -fno-integrated-as
endif
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
|
|
|
@ -44,8 +44,10 @@ USE_TRMM = 1
|
|||
endif
|
||||
|
||||
# Enable USE_TRMM for POWER8 only when BINARY64 is 1
# (both conditions are tested in a single comparison).
ifeq ($(CORE)/$(BINARY64), POWER8/1)
USE_TRMM = 1
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
USE_TRMM = 1
|
||||
|
|
|
@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
dot[0]=0.0;
|
||||
dot[1]=0.0;
|
||||
|
||||
#if !defined(__PPC__)
|
||||
CREAL(result) = 0.0 ;
|
||||
CIMAG(result) = 0.0 ;
|
||||
|
||||
#else
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||
#endif
|
||||
if ( n < 1 ) return(result);
|
||||
|
||||
inc_x2 = 2 * inc_x ;
|
||||
|
@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
i++ ;
|
||||
|
||||
}
|
||||
#if !defined(__POWER__)
|
||||
CREAL(result) = dot[0];
|
||||
CIMAG(result) = dot[1];
|
||||
#else
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]);
|
||||
#endif
|
||||
return(result);
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,184 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
|
||||
endif
|
|
@ -1,3 +1,44 @@
|
|||
# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
SGEMMKERNEL = gemm_kernel_power6.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = gemm_ncopy_4.S
|
||||
SGEMMOTCOPY = gemm_tcopy_4.S
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMKERNEL = gemm_kernel_power6.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = gemm_ncopy_4.S
|
||||
DGEMMOTCOPY = gemm_tcopy_4.S
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMKERNEL = zgemm_kernel_power6.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_power6.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
DTRSMKERNEL_LN = trsm_kernel_power6_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_power6_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_power6_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_power6_RT.S
|
||||
else
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
|
@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c
|
|||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
#
|
||||
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
else
|
||||
ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
|
@ -162,6 +215,7 @@ endif
|
|||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
endif
|
||||
#
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
|
@ -239,4 +293,3 @@ IDAMINKERNEL = ../arm/iamin.c
|
|||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
endif
|
||||
|
||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "casum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ccopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
|
|
@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zdot.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
return (result);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zgemv_n.c"
|
||||
#else
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
|||
|
||||
return (0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zgemv_t.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
|||
return (0);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
||||
{
|
||||
|
@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
|
|||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
|
@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
BLASLONG n1 = n & -8;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
i=n1;
|
||||
ix=2*n1;
|
||||
}
|
||||
|
||||
#endif
|
||||
while(i < n)
|
||||
{
|
||||
temp[0] = c*x[ix] + s*y[ix] ;
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "cswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dasum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "daxpy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dcopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ddot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
|
|
@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[3] * alpha; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] = result[0] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] = result[3] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] = result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] = result[0] * alpha;
|
||||
rowC[0] = result[3] * alpha;
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[3] * alpha; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] = result[2] * alpha;
|
||||
rowC[0] = result[1] * alpha;
|
||||
#else
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha;
|
||||
rowC[0] += result[1] * alpha;
|
||||
#endif
|
||||
|
||||
#define SET_ACC_ZERO4() \
|
||||
|
|
|
@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dgemv_n_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#define NBMAX 4096
|
||||
|
|
|
@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/gemv_t.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 1024
|
||||
//#define PREFETCH 1
|
||||
|
||||
#include <altivec.h>
|
||||
|
||||
#define HAVE_KERNEL4x8_ASM 1
|
||||
|
||||
|
||||
#if defined(HAVE_KERNEL4x8_ASM)
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
|
||||
|
||||
|
@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
|||
"stxvd2x 39, %[off], %[y] \n\t"
|
||||
"stxvd2x 40, %[off2], %[y] \n\t"
|
||||
|
||||
: [memy] "+m" (*(const double (*)[8])y),
|
||||
: [memy] "+m" (*(double (*)[8])y),
|
||||
[n] "+&r" (n),
|
||||
[a0] "=b" (a0),
|
||||
[a1] "=&b" (a1),
|
||||
|
@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
|
|||
[off2]"=&b" (off2),
|
||||
[temp] "=&b" (tempR)
|
||||
: [memx] "m" (*(const double (*)[n])x),
|
||||
[mem_ap] "m" (*(const double (*)[]) ap),
|
||||
[mem_ap] "m" (*(const double (*)[n*8]) ap),
|
||||
[alpha] "d" (alpha),
|
||||
"[a0]" (ap),
|
||||
[x] "b" (x),
|
||||
|
@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
return (0);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "drot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dscal_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(HAVE_KERNEL_8)
|
||||
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "dswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
|
|
@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
|
@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
return index;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
BLASLONG i = 0;
|
||||
|
@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
|
||||
BLASLONG n1 = n & -32;
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
if (n1 > 0) {
|
||||
|
||||
max = diamax_kernel_32(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) > maxf) {
|
||||
|
|
|
@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
|
@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
return index;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||
|
@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = diamin_kernel_32(n1, x, &minf);
|
||||
i = n1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) < minf) {
|
||||
|
|
|
@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
|
@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
|
@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
|
|
|
@ -25,13 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabs
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
|
@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
return index;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
minf = CABS1(x,0); //index will not be incremented
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
|
@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
|
@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sasum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
|
|
@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#define offset_0 0
|
||||
#define offset_1 16
|
||||
#define offset_2 32
|
||||
#define offset_3 48
|
||||
#define offset_4 64
|
||||
#define offset_5 80
|
||||
#define offset_6 96
|
||||
#define offset_7 112
|
||||
#define offset_8 128
|
||||
#define offset_9 144
|
||||
#define offset_10 160
|
||||
#define offset_11 176
|
||||
#define offset_12 192
|
||||
#define offset_13 208
|
||||
#define offset_14 224
|
||||
#define offset_15 240
|
||||
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
|
@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
__vector float v_a = {alpha,alpha,alpha,alpha};
|
||||
__vector float * v_y=(__vector float *)y;
|
||||
__vector float * v_x=(__vector float *)x;
|
||||
__vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha};
|
||||
__vector float * vptr_y =(__vector float *)y;
|
||||
__vector float * vptr_x =(__vector float *)x;
|
||||
|
||||
for(; i<n/4; i+=16){
|
||||
|
||||
|
||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
|
||||
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
|
||||
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
|
||||
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
|
||||
register __vector float vy_8 = vec_vsx_ld( offset_8 ,vptr_y ) ;
|
||||
register __vector float vy_9 = vec_vsx_ld( offset_9 ,vptr_y ) ;
|
||||
register __vector float vy_10 = vec_vsx_ld( offset_10 ,vptr_y ) ;
|
||||
register __vector float vy_11 = vec_vsx_ld( offset_11 ,vptr_y ) ;
|
||||
register __vector float vy_12 = vec_vsx_ld( offset_12 ,vptr_y ) ;
|
||||
register __vector float vy_13 = vec_vsx_ld( offset_13 ,vptr_y ) ;
|
||||
register __vector float vy_14 = vec_vsx_ld( offset_14 ,vptr_y ) ;
|
||||
register __vector float vy_15 = vec_vsx_ld( offset_15 ,vptr_y ) ;
|
||||
|
||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
|
||||
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
|
||||
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
|
||||
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
|
||||
register __vector float vx_8 = vec_vsx_ld( offset_8 ,vptr_x ) ;
|
||||
register __vector float vx_9 = vec_vsx_ld( offset_9 ,vptr_x ) ;
|
||||
register __vector float vx_10 = vec_vsx_ld( offset_10 ,vptr_x ) ;
|
||||
register __vector float vx_11 = vec_vsx_ld( offset_11 ,vptr_x ) ;
|
||||
register __vector float vx_12 = vec_vsx_ld( offset_12 ,vptr_x ) ;
|
||||
register __vector float vx_13 = vec_vsx_ld( offset_13 ,vptr_x ) ;
|
||||
register __vector float vx_14 = vec_vsx_ld( offset_14 ,vptr_x ) ;
|
||||
register __vector float vx_15 = vec_vsx_ld( offset_15 ,vptr_x ) ;
|
||||
vy_0 += vx_0*v_a;
|
||||
vy_1 += vx_1*v_a;
|
||||
vy_2 += vx_2*v_a;
|
||||
vy_3 += vx_3*v_a;
|
||||
vy_4 += vx_4*v_a;
|
||||
vy_5 += vx_5*v_a;
|
||||
vy_6 += vx_6*v_a;
|
||||
vy_7 += vx_7*v_a;
|
||||
vy_8 += vx_8*v_a;
|
||||
vy_9 += vx_9*v_a;
|
||||
vy_10 += vx_10*v_a;
|
||||
vy_11 += vx_11*v_a;
|
||||
vy_12 += vx_12*v_a;
|
||||
vy_13 += vx_13*v_a;
|
||||
vy_14 += vx_14*v_a;
|
||||
vy_15 += vx_15*v_a;
|
||||
|
||||
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_8, offset_8 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_9, offset_9 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_10, offset_10 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_11, offset_11 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_12, offset_12 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_13, offset_13 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_14, offset_14 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_15, offset_15 ,vptr_y ) ;
|
||||
|
||||
vptr_x+=16;
|
||||
vptr_y+=16;
|
||||
|
||||
/*
|
||||
|
||||
v_y[i] += v_a * v_x[i];
|
||||
v_y[i+1] += v_a * v_x[i+1];
|
||||
v_y[i+2] += v_a * v_x[i+2];
|
||||
|
@ -59,9 +149,11 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
|||
v_y[i+13] += v_a * v_x[i+13];
|
||||
v_y[i+14] += v_a * v_x[i+14];
|
||||
v_y[i+15] += v_a * v_x[i+15];
|
||||
*/
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
|
@ -74,11 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
if ( n1 )
|
||||
saxpy_kernel_64(n1, x, y, da);
|
||||
|
||||
i = n1;
|
||||
#endif
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "scopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
|
|
@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
#include "sdot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
|
||||
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
#if defined(TRMMKERNEL)
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[3] * alpha; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] = result[0] * alpha;
|
||||
rowC[0] = result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] = result[3] * alpha; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] = result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] = result[0] * alpha;
|
||||
rowC[0] = result[3] * alpha;
|
||||
#define SAVE4x2_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[6] * alpha; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] = result[4] * alpha; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] = result[6] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] = result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] = result[6] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] = result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC[0] = result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||
rowC[0] = result[0] * alpha;
|
||||
rowC[0] = result[6] * alpha;
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] = result[3] * alpha; \
|
||||
rowC[0] = result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] = result[2] * alpha;
|
||||
rowC[0] = result[1] * alpha;
|
||||
#else
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE4x2_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[6] * alpha;
|
||||
#define SAVE2x4_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha;
|
||||
rowC[0] += result[1] * alpha;
|
||||
#endif
|
||||
#define KERNEL(i, j) \
|
||||
__builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
|
||||
|
|
|
@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/gemv_n.c"
|
||||
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
return(0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/gemv_t.c"
|
||||
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16)
|
|||
#define BF16TOF32(x) x
|
||||
#endif
|
||||
|
||||
typedef unsigned char vec_t __attribute__ ((vector_size (16)));
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
|
||||
|
@ -64,54 +64,54 @@ vector char mask =
|
|||
#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y)
|
||||
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[1*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[2*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[3*ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v4sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[3] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[5*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[1] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[6*ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v4sf_t *) &CO[7*ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[3] * alpha;
|
||||
#define SAVE4x2_ACC(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[0* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[1* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[2* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc (result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC = (v2sf_t *) &CO[3* ldc+J]; \
|
||||
rowC[0] += result[6] * alpha;
|
||||
#define SAVE4x2_ACC1(ACC, J) \
|
||||
__builtin_mma_disassemble_acc ((void *)result, ACC); \
|
||||
rowC = (v2sf_t *) &CO[4* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[5* ldc+J]; \
|
||||
rowC[0] += result[2] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[6* ldc+J]; \
|
||||
rowC[0] += result[4] * alpha; \
|
||||
rowC = (v2sf_t *) &CO[7* ldc+J]; \
|
||||
rowC[0] += result[0] * alpha;
|
||||
rowC[0] += result[6] * alpha;
|
||||
|
||||
#define MMA __builtin_mma_xvbf16ger2pp
|
||||
|
||||
/* Unpack MMA accumulator ACC into result[] and add result[0] and result[1],
   scaled by alpha, to CO[0*ldc+J] and CO[1*ldc+J] respectively.
   Relies on result, rowC, CO, ldc and alpha from the expansion site. */
#define SAVE2x4_ACC(ACC, J)  \
  __builtin_mma_disassemble_acc ((void *)result, ACC); \
  rowC = (v4sf_t *) &CO[0* ldc+J]; \
  rowC[0] += result[0] * alpha; \
  rowC = (v4sf_t *) &CO[1* ldc+J]; \
  rowC[0] += result[1] * alpha;
|
||||
|
||||
#define SET_ACC_ZERO4() \
|
||||
__builtin_mma_xxsetaccz (&acc0); \
|
||||
|
|
|
@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "srot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sscal_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(HAVE_KERNEL_16)
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_32
|
||||
|
||||
|
|
|
@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zasum_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zaxpy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_4
|
||||
|
|
|
@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zcopy_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zdot_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
|
|||
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
|
||||
|
||||
if ( n <= 0 )
|
||||
{
|
||||
{ /*
|
||||
__real__ result = 0.0 ;
|
||||
__imag__ result = 0.0 ;
|
||||
*/
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||
return(result);
|
||||
|
||||
}
|
||||
|
@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
|
|||
}
|
||||
|
||||
#if !defined(CONJ)
|
||||
/*
|
||||
__real__ result = dot[0] - dot[1];
|
||||
__imag__ result = dot[2] + dot[3];
|
||||
*/
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
|
||||
#else
|
||||
/*
|
||||
__real__ result = dot[0] + dot[1];
|
||||
__imag__ result = dot[2] - dot[3];
|
||||
*/
|
||||
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
#define NBMAX 4096
|
||||
|
|
|
@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#define NBMAX 4096
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
||||
#endif
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
|
|
@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#if !defined(__VEC__) || !defined(__ALTIVEC__)
|
||||
#include "../arm/zrot.c"
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(DOUBLE)
|
||||
#include "zscal_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
|
|
@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "zswap_microk_power8.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
|
|
@ -1,667 +0,0 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
#define USE_ALLOC_HEAP
|
||||
#endif
|
||||
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifndef KERNEL_FUNC
|
||||
#ifndef LOWER
|
||||
#define KERNEL_FUNC SYRK_KERNEL_U
|
||||
#else
|
||||
#define KERNEL_FUNC SYRK_KERNEL_L
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef LOWER
|
||||
#ifndef COMPLEX
|
||||
#define TRSM_KERNEL TRSM_KERNEL_LT
|
||||
#else
|
||||
#define TRSM_KERNEL TRSM_KERNEL_LC
|
||||
#endif
|
||||
#else
|
||||
#ifndef COMPLEX
|
||||
#define TRSM_KERNEL TRSM_KERNEL_RN
|
||||
#else
|
||||
#define TRSM_KERNEL TRSM_KERNEL_RR
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef CACHE_LINE_SIZE
|
||||
#define CACHE_LINE_SIZE 8
|
||||
#endif
|
||||
|
||||
#ifndef DIVIDE_RATE
|
||||
#define DIVIDE_RATE 2
|
||||
#endif
|
||||
|
||||
#ifndef SWITCH_RATIO
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
#ifndef LOWER
|
||||
#define TRANS
|
||||
#endif
|
||||
|
||||
#ifndef SYRK_LOCAL
|
||||
#if !defined(LOWER) && !defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_UN
|
||||
#elif !defined(LOWER) && defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_UT
|
||||
#elif defined(LOWER) && !defined(TRANS)
|
||||
#define SYRK_LOCAL SYRK_LN
|
||||
#else
|
||||
#define SYRK_LOCAL SYRK_LT
|
||||
#endif
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#ifdef HAVE_C11
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
/* Apply KERNEL_FUNC to the M x N tile of C whose top-left element is (X, Y);
   the last argument (X) - (Y) is the tile's offset from the diagonal.
   The complex variant passes both components of ALPHA. */
#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#endif
#endif

/* Pack an M x N panel of A starting at (X, Y) into BUFFER ("inner" operand).
   NOTE: the expansions end in ';', so call sites get an extra empty statement. */
#ifndef ICOPY_OPERATION
#ifndef TRANS
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

/* Pack an M x N panel of A starting at (X, Y) into BUFFER ("outer" operand). */
#ifndef OCOPY_OPERATION
#ifdef TRANS
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

/* Short aliases for fields of the blas_arg_t named `args` in inner_thread.
   Note that N aliases args->m. */
#ifndef S
#define S	args -> a
#endif
#ifndef A
#define A	args -> b
#endif
#ifndef C
#define C	args -> c
#endif
#ifndef LDA
#define LDA	args -> lda
#endif
#ifndef N
#define N	args -> m
#endif
#ifndef K
#define K	args -> k
#endif
|
||||
|
||||
/*
 * Worker routine run by every thread for one trailing update step.
 *
 * args     shared argument block: S (args->a) is the k x k triangular block,
 *          A (args->b) the panel to be solved, C (args->c) the trailing matrix
 *          to update, args->common the job_t handshake table.
 * range_m  unused here.
 * range_n  partition boundaries; thread `mypos` owns [range_n[mypos], range_n[mypos+1]).
 * sa, sb   work buffers; sb receives the packed triangular block, the packed
 *          panel sub-buffers are placed after it.
 * mypos    index of this thread.
 *
 * Steps:
 *   1. Pack the triangular block into sb and, per sub-buffer, pack the owned
 *      part of the panel and run TRSM_KERNEL on it.
 *   2. Publish each finished sub-buffer by storing its address in
 *      job[mypos].working[i][...] for the threads that will consume it.
 *   3. For each row strip of the owned range, pack it into sa and apply
 *      KERNEL_OPERATION against every published sub-buffer, spinning until a
 *      foreign sub-buffer becomes available; clear the flag after the last strip.
 *   4. Wait until all consumers have cleared this thread's flags.
 *
 * Always returns 0.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda;
  BLASLONG m_from, m_to;

  FLOAT *alpha;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;

  alpha = (FLOAT *)args -> alpha;

  m_from = range_n[mypos + 0];
  m_to   = range_n[mypos + 1];

#if 0
  fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
#endif

  /* Width of one sub-buffer: the owned range split DIVIDE_RATE ways, rounded
     up to a multiple of GEMM_UNROLL_MN. */
  div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

  /* Sub-buffers start after the k*k packed triangular block in sb, aligned. */
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }

#ifndef LOWER
  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif

  for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {

    for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

      min_jj = MIN(m_to, xxx + div_n) - jjs;

#ifndef LOWER
      if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
      if (min_jj > GEMM_P) min_jj = GEMM_P;
#endif

#ifndef LOWER
      OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL (k, min_jj, k, dm1,
#ifdef COMPLEX
		   ZERO,
#endif
		   sb,
		   buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
		   a + jjs * lda * COMPSIZE, lda, 0);
#else
      ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL (min_jj, k, k, dm1,
#ifdef COMPLEX
		   ZERO,
#endif
		   buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
		   sb,
		   a + jjs * COMPSIZE, lda, 0);
#endif
    }

    /* Publish this sub-buffer to the threads that will read it. */
#ifndef LOWER
    for (i = 0; i <= mypos; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
    for (i = mypos; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif

    WMB;
  }

  /* First row strip of the owned range. */
  min_i = m_to - m_from;

  if (min_i >= GEMM_P * 2) {
    min_i = GEMM_P;
  } else
    if (min_i > GEMM_P) {
      min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    }

#ifndef LOWER
  ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
  OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif

  current = mypos;

#ifndef LOWER
  while (current < args -> nthreads)
#else
  while (current >= 0)
#endif
    {
      div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

	/* thread has to wait */
	if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

	KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
			 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			 c, lda, m_from, xxx);

	/* Single-strip case: this was the last use of the buffer, release it. */
	if (m_from + min_i >= m_to) {
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  WMB;
	}
      }

#ifndef LOWER
      current ++;
#else
      current --;
#endif
    }

  /* Remaining row strips; every needed buffer is already known to be published. */
  for(is = m_from + min_i; is < m_to; is += min_i){
    min_i = m_to - is;

    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else
      if (min_i > GEMM_P) {
	min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
      }

#ifndef LOWER
    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
    OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif

    current = mypos;

#ifndef LOWER
    while (current < args -> nthreads)
#else
    while (current >= 0)
#endif
      {
	div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, lda, is, xxx);

	  /* Last strip: release the buffer for its owner. */
	  if (is + min_i >= m_to) {
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	    WMB;
	  }
	}
#ifndef LOWER
	current ++;
#else
	current --;
#endif
      }
  }

  /* Do not return (and let sb be reused) until every consumer has released
     the sub-buffers this thread published. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
	while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  return 0;
}
|
||||
|
||||
/*
 * Split one trailing update across threads and run inner_thread on each part.
 *
 * args    m, k, a, b, c, lda, alpha and nthreads are read; they are copied into
 *         a private blas_arg_t together with the job_t handshake table.
 * sa, sb  work buffers, attached to the first queue entry only.
 *
 * The range [0, m) is cut into slices whose width w at offset i satisfies
 * (i + w)^2 - i^2 ~= m^2 / nthreads, rounded up to a multiple of (mask + 1),
 * where mask + 1 is the larger GEMM unroll factor for the data type.  In the
 * upper (non-LOWER) build the boundaries are stored from the top of range[]
 * downwards.
 *
 * When USE_ALLOC_HEAP is defined the job table is heap-allocated; on
 * allocation failure the process prints a message and exits with status 1.
 *
 * Always returns 0.
 */
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t          job[MAX_CPU_NUMBER];
#else
  job_t *        job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#elif defined(HALF)
  mode  =  BLAS_HALF  | BLAS_REAL;
  mask  = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif
#endif

  newarg.m        = args -> m;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.alpha    = args -> alpha;

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  newarg.common   = (void *)job;

  n_from = 0;
  n_to   = args -> m;

#ifndef LOWER

  /* Boundaries are filled in from range[MAX_CPU_NUMBER] downwards. */
  range[MAX_CPU_NUMBER] = n_to - n_from;
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){

    if (nthreads - num_cpu > 1) {

      double di   = (double)i;

      width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));

      /* First slice absorbs the remainder so later boundaries stay aligned. */
      if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));

      if ((width > n - i) || (width < mask)) width = n - i;

    } else {
      /* Last available thread takes whatever is left. */
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

  /* num_cpu is only known now, so the common range_n base is set afterwards. */
  for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){

    if (nthreads - num_cpu > 1) {

      double di   = (double)i;

      width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));

      if ((width > n - i) || (width < mask)) width = n - i;

    } else {
      /* Last available thread takes whatever is left. */
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#endif

  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* All handshake flags start out as "no buffer published". */
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }

    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Blocked, multithreaded Cholesky factorization entry point.
 *
 * args     n, a, lda and nthreads are read.
 * range_m  unused.
 * range_n  optional; when non-NULL, n is taken as range_n[1] - range_n[0].
 * sa, sb   work buffers passed through to the kernels.
 * myid     unused.
 *
 * Falls back to POTRF_U_SINGLE / POTRF_L_SINGLE when only one thread is
 * requested or when n <= 2 * GEMM_UNROLL_N.  Otherwise walks the diagonal in
 * blocks of size `blocking`: each diagonal block is factored by a recursive
 * call, then the trailing part is updated either through thread_driver or,
 * with USE_SIMPLE_THREADED_LEVEL3, through a threaded TRSM followed by a
 * threaded HERK.
 *
 * Returns 0 on success; a non-zero info from a diagonal block is returned
 * offset by that block's starting index.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG n, bk, i, blocking, lda;
  BLASLONG info;
  int mode;
  blas_arg_t newarg;
  FLOAT *a;
  FLOAT alpha[2] = { -ONE, ZERO};

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif

  if (args -> nthreads  == 1) {
#ifndef LOWER
    info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
#else
    info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
#endif
    return info;
  }

  n  = args -> n;
  a  = (FLOAT *)args -> a;
  lda = args -> lda;

  if (range_n) n  = range_n[1] - range_n[0];

  /* Too small to be worth threading. */
  if (n <= GEMM_UNROLL_N * 2) {
#ifndef LOWER
    info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
#else
    info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
#endif
    return info;
  }

  newarg.lda = lda;
  newarg.ldb = lda;
  newarg.ldc = lda;
  newarg.alpha = alpha;
  newarg.beta = NULL;
  newarg.nthreads = args -> nthreads;

  /* Block size: about n/2 rounded up to GEMM_UNROLL_N, capped at GEMM_Q. */
  blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
  if (blocking > GEMM_Q) blocking = GEMM_Q;

  for (i = 0; i < n; i += blocking) {
    bk = n - i;
    if (bk > blocking) bk = blocking;

    /* Factor the bk x bk diagonal block recursively. */
    newarg.m = bk;
    newarg.n = bk;
    newarg.a = a + (i + i * lda) * COMPSIZE;

    info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
    if (info) return info + i;

    if (n - i - bk > 0) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
      /* Fused panel solve + trailing update. */
      newarg.m = n - i - bk;
      newarg.k = bk;
#ifndef LOWER
      newarg.b = a + ( i       + (i + bk) * lda) * COMPSIZE;
#else
      newarg.b = a + ((i + bk) +  i       * lda) * COMPSIZE;
#endif
      newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;

      thread_driver(&newarg, sa, sb);
#else

#ifndef LOWER
      newarg.m = bk;
      newarg.n = n - i - bk;
      newarg.a = a + (i +  i       * lda) * COMPSIZE;
      newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;

      gemm_thread_n(mode | BLAS_TRANSA_T,
		    &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);

      newarg.n = n - i - bk;
      newarg.k = bk;
      newarg.a = a + ( i       + (i + bk) * lda) * COMPSIZE;
      newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;

#if 0
      HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
#else
      syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
		  &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
#endif
#else
      newarg.m = n - i - bk;
      newarg.n = bk;
      newarg.a = a + (i      + i * lda) * COMPSIZE;
      newarg.b = a + (i + bk + i * lda) * COMPSIZE;

      gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
		    &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);

      newarg.n = n - i - bk;
      newarg.k = bk;
      newarg.a = a + (i + bk +  i       * lda) * COMPSIZE;
      newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;

#if 0
      HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
#else
      syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
		  &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
#endif
#endif

#endif
    }
  }
  return 0;
}
|
|
@ -101,7 +101,12 @@ static FLOAT dm1 = -1.;
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#ifdef HAVE_C11
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
#elif defined(DOUBLE)
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
|
||||
#elif defined(HALF)
|
||||
mode = BLAS_HALF | BLAS_REAL;
|
||||
mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
|
||||
|
|
43
param.h
43
param.h
|
@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 65536
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
|
||||
#if defined(__32BIT__)
|
||||
#warning using BINARY32==POWER6
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#else
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
||||
|
@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#endif
|
||||
#define SGEMM_DEFAULT_P 1280UL
|
||||
#define DGEMM_DEFAULT_P 640UL
|
||||
#define CGEMM_DEFAULT_P 640UL
|
||||
|
@ -2769,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#elif defined(THUNDERX3T110)
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 320
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 352
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 224
|
||||
#define ZGEMM_DEFAULT_Q 112
|
||||
|
||||
#define SGEMM_DEFAULT_R 4096
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#elif defined(NEOVERSEN1)
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
|
|
Loading…
Reference in New Issue