Merge pull request #74 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-07-30 01:04:09 +02:00 committed by GitHub
commit 525db5401c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
63 changed files with 786 additions and 838 deletions

View File

@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif endif
ifeq ($(CORE), THUNDERX3T110)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
else
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), TSV110) ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110

View File

@ -11,34 +11,34 @@ endif
ifeq ($(CORE), POWER10) ifeq ($(CORE), POWER10)
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp
else else
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif endif
endif endif
ifeq ($(CORE), POWER9) ifeq ($(CORE), POWER9)
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
ifneq ($(C_COMPILER), PGI) ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
else else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
endif endif
ifneq ($(F_COMPILER), PGI) ifneq ($(F_COMPILER), PGI)
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp
else else
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
endif endif
else else
ifneq ($(C_COMPILER), PGI) ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
else else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif endif
ifneq ($(F_COMPILER), PGI) ifneq ($(F_COMPILER), PGI)
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math
else else
FCOMMON_OPT += -O2 -Mrecursive FCOMMON_OPT += -O2 -Mrecursive
endif endif
@ -48,26 +48,26 @@ endif
ifeq ($(CORE), POWER8) ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
ifneq ($(C_COMPILER), PGI) ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp
else else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp
endif endif
ifneq ($(F_COMPILER), PGI) ifneq ($(F_COMPILER), PGI)
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp
else else
FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp
endif endif
else else
ifneq ($(C_COMPILER), PGI) ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
else else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif endif
ifneq ($(F_COMPILER), PGI) ifneq ($(F_COMPILER), PGI)
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
else else
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif endif
else else
FCOMMON_OPT += -O2 -Mrecursive FCOMMON_OPT += -O2 -Mrecursive

View File

@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110 DYNAMIC_CORE += TSV110
DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += EMAG8180
DYNAMIC_CORE += THUNDERX3T110
endif endif
ifeq ($(ARCH), zarch) ifeq ($(ARCH), zarch)
@ -617,7 +618,6 @@ DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC) ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER9
DYNAMIC_CORE += POWER10 DYNAMIC_CORE += POWER10
override LDFLAGS += -Wl,-no-power10-stubs
endif endif
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1) ifeq ($(GCCVERSIONGT5), 1)
@ -627,11 +627,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif endif
ifeq ($(GCCVERSIONGTEQ11), 1) ifeq ($(GCCVERSIONGTEQ11), 1)
DYNAMIC_CORE += POWER10 DYNAMIC_CORE += POWER10
override LDFLAGS += -Wl,-no-power10-stubs
else ifeq ($(GCCVERSIONGTEQ10), 1) else ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq ($(GCCMINORVERSIONGTEQ2), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1)
DYNAMIC_CORE += POWER10 DYNAMIC_CORE += POWER10
override LDFLAGS += -Wl,-no-power10-stubs
endif endif
else else
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
@ -1241,7 +1239,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
include $(TOPDIR)/Makefile.$(ARCH) include $(TOPDIR)/Makefile.$(ARCH)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
endif
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
ifeq ($(CORE), PPC440) ifeq ($(CORE), PPC440)

View File

@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source ## Installation from Source
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git. using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
sure to use the develop branch - master is several years out of date due to a change of maintainership.)
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
Most can also be given directly on the make or cmake command line. Most can also be given directly on the make or cmake command line.

View File

@ -96,6 +96,7 @@ FALKOR
THUNDERX THUNDERX
THUNDERX2T99 THUNDERX2T99
TSV110 TSV110
THUNDERX3T110
9.System Z: 9.System Z:
ZARCH_GENERIC ZARCH_GENERIC

View File

@ -45,7 +45,7 @@ endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (ARM64) if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
endif () endif ()
if (POWER) if (POWER)

View File

@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
"#define HAVE_VFP\n" "#define HAVE_VFP\n"
"#define HAVE_NEON\n" "#define HAVE_NEON\n"
"#define ARMV8\n") "#define ARMV8\n")
if ("${TCORE}" STREQUAL "CORTEXA57")
set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
else ()
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 8)
endif ()
set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4) set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_M 8)
@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16) set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX3T110")
file(APPEND ${TARGET_CONF_TEMP}
"#define THUNDERX3T110\n"
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t8\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t8\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define L3_SIZE\t94371840\n"
"#define L3_LINESIZE\t64\n"
"#define L3_ASSOCIATIVE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "TSV110") elseif ("${TCORE}" STREQUAL "TSV110")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n" "#define ARMV8\n"

View File

@ -40,6 +40,7 @@
// Cavium // Cavium
#define CPU_THUNDERX 7 #define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8 #define CPU_THUNDERX2T99 8
#define CPU_THUNDERX3T110 12
//Hisilicon //Hisilicon
#define CPU_TSV110 9 #define CPU_TSV110 9
// Ampere // Ampere
@ -57,7 +58,8 @@ static char *cpuname[] = {
"THUNDERX2T99", "THUNDERX2T99",
"TSV110", "TSV110",
"EMAG8180", "EMAG8180",
"NEOVERSEN1" "NEOVERSEN1",
"THUNDERX3T110"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
@ -72,7 +74,8 @@ static char *cpuname_lower[] = {
"thunderx2t99", "thunderx2t99",
"tsv110", "tsv110",
"emag8180", "emag8180",
"neoversen1" "neoversen1",
"thunderx3t110"
}; };
int get_feature(char *search) int get_feature(char *search)
@ -158,6 +161,8 @@ int detect(void)
return CPU_THUNDERX; return CPU_THUNDERX;
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99; return CPU_THUNDERX2T99;
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8"))
return CPU_THUNDERX3T110;
// HiSilicon // HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110; return CPU_TSV110;
@ -372,7 +377,25 @@ void get_cpuconfig(void)
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
break;
case CPU_THUNDERX3T110:
printf("#define THUNDERX3T110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 524288 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 94371840 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
} }
get_cpucount(); get_cpucount();
} }

View File

@ -1454,10 +1454,11 @@ int get_cpuname(void){
return CPUTYPE_OPTERON; return CPUTYPE_OPTERON;
case 1: case 1:
case 3: case 3:
case 7: // case 7:
case 10: // case 10:
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 5: case 5:
case 7:
return CPUTYPE_BOBCAT; return CPUTYPE_BOBCAT;
case 6: case 6:
switch (model) { switch (model) {
@ -1507,6 +1508,8 @@ int get_cpuname(void){
// AMD Ryzen // AMD Ryzen
case 8: case 8:
// AMD Ryzen2 // AMD Ryzen2
default:
// Matisse/Renoir and other recent Ryzen2
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CPUTYPE_ZEN; return CPUTYPE_ZEN;
@ -1516,6 +1519,16 @@ int get_cpuname(void){
else else
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
} }
break;
case 10: // Zen3
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
#else
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CPUTYPE_BARCELONA;
} }
break; break;
} }
@ -2107,7 +2120,7 @@ int get_coretype(void){
return CORE_PILEDRIVER; return CORE_PILEDRIVER;
else else
return CORE_BARCELONA; //OS don't support AVX. return CORE_BARCELONA; //OS don't support AVX.
case 5: // New EXCAVATOR case 5: // New EXCAVATOR
if(support_avx()) if(support_avx())
return CORE_EXCAVATOR; return CORE_EXCAVATOR;
else else
@ -2135,12 +2148,14 @@ int get_coretype(void){
} }
break; break;
} }
} else if (exfamily == 8) { } else if (exfamily == 8 || exfamily == 10) {
switch (model) { switch (model) {
case 1: case 1:
// AMD Ryzen // AMD Ryzen
case 8: case 8:
// Ryzen 2 // Ryzen 2
default:
// Matisse,Renoir Ryzen2 models
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CORE_ZEN; return CORE_ZEN;

View File

@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) { if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON; else return &gotoblas_OPTERON;
} else if (exfamily == 5) { } else if (exfamily == 5 || exfamily == 7) {
return &gotoblas_BOBCAT; return &gotoblas_BOBCAT;
} else if (exfamily == 6) { } else if (exfamily == 6) {
if(model == 1){ if(model == 1){
@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){
} }
} }
} else if (exfamily == 8) { } else if (exfamily == 8) {
if (model == 1 || model == 8) { /* if (model == 1 || model == 8) */ {
if(support_avx()) if(support_avx())
return &gotoblas_ZEN; return &gotoblas_ZEN;
else{ else{
@ -724,10 +724,18 @@ static gotoblas_t *get_coretype(void){
else{ else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
} }
} else if (exfamily == 10) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else { }else {
return &gotoblas_BARCELONA; return &gotoblas_BARCELONA;
} }
} }
} }

View File

@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_EMAG8180;
extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_NEOVERSEN1;
extern gotoblas_t gotoblas_THUNDERX3T110;
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 11 #define NUM_CORETYPES 12
/* /*
* In case asm/hwcap.h is outdated on the build system, make sure * In case asm/hwcap.h is outdated on the build system, make sure
@ -82,6 +83,7 @@ static char *corename[] = {
"tsv110", "tsv110",
"emag8180", "emag8180",
"neoversen1", "neoversen1",
"thunderx3t110",
"unknown" "unknown"
}; };
@ -97,6 +99,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_TSV110) return corename[ 8];
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
return corename[NUM_CORETYPES]; return corename[NUM_CORETYPES];
} }
@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 8: return (&gotoblas_TSV110); case 8: return (&gotoblas_TSV110);
case 9: return (&gotoblas_EMAG8180); case 9: return (&gotoblas_EMAG8180);
case 10: return (&gotoblas_NEOVERSEN1); case 10: return (&gotoblas_NEOVERSEN1);
case 11: return (&gotoblas_THUNDERX3T110);
} }
snprintf(message, 128, "Core not found: %s\n", coretype); snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message); openblas_warning(1, message);
@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_THUNDERX; return &gotoblas_THUNDERX;
case 0x0af: // ThunderX2 case 0x0af: // ThunderX2
return &gotoblas_THUNDERX2T99; return &gotoblas_THUNDERX2T99;
case 0x0b8: // ThunderX3
return &gotoblas_THUNDERX3T110;
} }
break; break;
case 0x48: // HiSilicon case 0x48: // HiSilicon

View File

@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "EMAG8180" #define CORENAME "EMAG8180"
#endif #endif
#ifdef FORCE_THUNDERX3T110
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX3T110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTHUNDERX3T110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx3t110"
#define CORENAME "THUNDERX3T110"
#else
#endif
#ifdef FORCE_ZARCH_GENERIC #ifdef FORCE_ZARCH_GENERIC
#define FORCE #define FORCE
#define ARCHITECTURE "ZARCH" #define ARCHITECTURE "ZARCH"

View File

@ -42,7 +42,7 @@
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
// Multithreaded swap gives performance benefits in ThunderX2T99 // Multithreaded swap gives performance benefits in ThunderX2T99
#else #else
// Disable multi-threading as it does not show any performance // Disable multi-threading as it does not show any performance

View File

@ -42,7 +42,7 @@
#include "functable.h" #include "functable.h"
#endif #endif
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110)
// Multithreaded swap gives performance benefits in ThunderX2T99 // Multithreaded swap gives performance benefits in ThunderX2T99
#else #else
// Disable multi-threading as it does not show any performance // Disable multi-threading as it does not show any performance

View File

@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
endif endif
ifeq ($(ARCH), power)
ifeq ($(C_COMPILER), CLANG)
override CFLAGS += -fno-integrated-as
endif
endif
AVX2OPT = AVX2OPT =
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0 # AVX2 support was added in 4.7.0

View File

@ -44,8 +44,10 @@ USE_TRMM = 1
endif endif
ifeq ($(CORE), POWER8) ifeq ($(CORE), POWER8)
ifeq ($(BINARY64),1)
USE_TRMM = 1 USE_TRMM = 1
endif endif
endif
ifeq ($(CORE), POWER9) ifeq ($(CORE), POWER9)
USE_TRMM = 1 USE_TRMM = 1

View File

@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
dot[0]=0.0; dot[0]=0.0;
dot[1]=0.0; dot[1]=0.0;
#if !defined(__PPC__)
CREAL(result) = 0.0 ; CREAL(result) = 0.0 ;
CIMAG(result) = 0.0 ; CIMAG(result) = 0.0 ;
#else
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
#endif
if ( n < 1 ) return(result); if ( n < 1 ) return(result);
inc_x2 = 2 * inc_x ; inc_x2 = 2 * inc_x ;
@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
i++ ; i++ ;
} }
CREAL(result) = dot[0]; #if !defined(__POWER__)
CREAL(result) = dot[0];
CIMAG(result) = dot[1]; CIMAG(result) = dot[1];
#else
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]);
#endif
return(result); return(result);
} }

View File

@ -0,0 +1,184 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
endif
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
endif
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
endif
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
endif

View File

@ -1,3 +1,44 @@
# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
SGEMMKERNEL = gemm_kernel_power6.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_power6.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_power6.S
CGEMMINCOPY = ../generic/zgemm_ncopy_2.c
CGEMMITCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_power6.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
else
#SGEMM_BETA = ../generic/gemm_beta.c #SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c
@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
DTRSMKERNEL_LN = trsm_kernel_power6_LN.S
DTRSMKERNEL_LT = trsm_kernel_power6_LT.S
DTRSMKERNEL_RN = trsm_kernel_power6_LT.S
DTRSMKERNEL_RT = trsm_kernel_power6_RT.S
else
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c
# #
SAXPYKERNEL = saxpy.c SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c DAXPYKERNEL = daxpy.c
#
ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
CAXPYKERNEL = zaxpy.S
else
ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
ifneq ($(GCCVERSIONGTEQ9),1) ifneq ($(GCCVERSIONGTEQ9),1)
CAXPYKERNEL = caxpy_power8.S CAXPYKERNEL = caxpy_power8.S
@ -162,6 +215,7 @@ endif
else else
CAXPYKERNEL = caxpy.c CAXPYKERNEL = caxpy.c
endif endif
endif
# #
ZAXPYKERNEL = zaxpy.c ZAXPYKERNEL = zaxpy.c
# #
@ -239,4 +293,3 @@ IDAMINKERNEL = ../arm/iamin.c
IZAMAXKERNEL = ../arm/izamax.c IZAMAXKERNEL = ../arm/izamax.c
IZAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = ../arm/izamin.c
endif endif

View File

@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "casum_microk_power8.c" #include "casum_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "ccopy_microk_power8.c" #include "ccopy_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/zdot.c"
#else
#include "common.h" #include "common.h"
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
return (result); return (result);
} }
#endif

View File

@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/zgemv_n.c"
#else
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0); return (0);
} }
#endif

View File

@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/zgemv_t.c"
#else
#include "common.h" #include "common.h"
@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0); return (0);
} }
#endif

View File

@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
static void crot_kernel_8 (long n, float *x, float *y, float c, float s) static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
{ {
@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
} }
#endif #endif
#endif
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@ -183,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) ) if ( (inc_x == 1) && (inc_y == 1) )
{ {
#if defined(__VEC__) || defined(__ALTIVEC__)
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if ( n1 > 0 ) if ( n1 > 0 )
{ {
@ -191,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
i=n1; i=n1;
ix=2*n1; ix=2*n1;
} }
#endif
while(i < n) while(i < n)
{ {
temp[0] = c*x[ix] + s*y[ix] ; temp[0] = c*x[ix] + s*y[ix] ;

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "cswap_microk_power8.c" #include "cswap_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dasum_microk_power8.c" #include "dasum_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "daxpy_microk_power8.c" #include "daxpy_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dcopy_microk_power8.c" #include "dcopy_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "ddot_microk_power8.c" #include "ddot_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8

View File

@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <altivec.h> #include <altivec.h>
typedef unsigned char vec_t __attribute__ ((vector_size (16))); typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
#define SAVE_ACC(ACC, J) \ #define SAVE_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] = result[3] * alpha; \ rowC[0] = result[0] * alpha; \
rowC = (v4sf_t *) &CO[1*ldc+J]; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \
rowC[0] = result[2] * alpha; \ rowC[0] = result[1] * alpha; \
rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \
rowC[0] = result[1] * alpha; \
rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] = result[0] * alpha;
#define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] = result[3] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] = result[2] * alpha; \ rowC[0] = result[2] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] = result[3] * alpha;
#define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] = result[0] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] = result[1] * alpha; \ rowC[0] = result[1] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] = result[2] * alpha; \
rowC = (v4sf_t *) &CO[7*ldc+J]; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \
rowC[0] = result[0] * alpha; rowC[0] = result[3] * alpha;
#define SAVE2x4_ACC(ACC, J) \ #define SAVE2x4_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] = result[3] * alpha; \ rowC[0] = result[0] * alpha; \
rowC = (v4sf_t *) &CO[1* ldc+J]; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \
rowC[0] = result[2] * alpha; rowC[0] = result[1] * alpha;
#else #else
#define SAVE_ACC(ACC, J) \ #define SAVE_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[1*ldc+J]; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \
rowC[0] += result[2] * alpha; \ rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \
rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] += result[0] * alpha;
#define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[3] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] += result[2] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] += result[3] * alpha;
#define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] += result[1] * alpha; \ rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[7*ldc+J]; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[3] * alpha;
#define SAVE2x4_ACC(ACC, J) \ #define SAVE2x4_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[1* ldc+J]; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \
rowC[0] += result[2] * alpha; rowC[0] += result[1] * alpha;
#endif #endif
#define SET_ACC_ZERO4() \ #define SET_ACC_ZERO4() \

View File

@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dgemv_n_microk_power8.c" #include "dgemv_n_microk_power8.c"
#endif #endif
#endif
#define NBMAX 4096 #define NBMAX 4096

View File

@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_t.c"
#else
#include "common.h" #include "common.h"
#define NBMAX 1024 #define NBMAX 1024
//#define PREFETCH 1 //#define PREFETCH 1
#include <altivec.h> #include <altivec.h>
#define HAVE_KERNEL4x8_ASM 1 #define HAVE_KERNEL4x8_ASM 1
#if defined(HAVE_KERNEL4x8_ASM) #if defined(HAVE_KERNEL4x8_ASM)
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
@ -355,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"stxvd2x 39, %[off], %[y] \n\t" "stxvd2x 39, %[off], %[y] \n\t"
"stxvd2x 40, %[off2], %[y] \n\t" "stxvd2x 40, %[off2], %[y] \n\t"
: [memy] "+m" (*(const double (*)[8])y), : [memy] "+m" (*(double (*)[8])y),
[n] "+&r" (n), [n] "+&r" (n),
[a0] "=b" (a0), [a0] "=b" (a0),
[a1] "=&b" (a1), [a1] "=&b" (a1),
@ -369,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
[off2]"=&b" (off2), [off2]"=&b" (off2),
[temp] "=&b" (tempR) [temp] "=&b" (tempR)
: [memx] "m" (*(const double (*)[n])x), : [memx] "m" (*(const double (*)[n])x),
[mem_ap] "m" (*(const double (*)[]) ap), [mem_ap] "m" (*(const double (*)[n*8]) ap),
[alpha] "d" (alpha), [alpha] "d" (alpha),
"[a0]" (ap), "[a0]" (ap),
[x] "b" (x), [x] "b" (x),
@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
#endif

View File

@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1" #pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "drot_microk_power8.c" #include "drot_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dscal_microk_power8.c" #include "dscal_microk_power8.c"
#endif #endif
#endif
#if !defined(HAVE_KERNEL_8) #if !defined(HAVE_KERNEL_8)

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "dswap_microk_power8.c" #include "dswap_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(__VEC__) || defined(__ALTIVEC__)
#include <altivec.h> #include <altivec.h>
#endif
#if defined(DOUBLE) #if defined(DOUBLE)
#define ABS fabs #define ABS fabs
@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__VEC__) || defined(__ALTIVEC__)
/** /**
* Find maximum index * Find maximum index
* Warning: requirements n>0 and n % 32 == 0 * Warning: requirements n>0 and n % 32 == 0
@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
return index; return index;
} }
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
#if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)
if (n1 > 0) { if (n1 > 0) {
max = diamax_kernel_32(n1, x, &maxf); max = diamax_kernel_32(n1, x, &maxf);
i = n1; i = n1;
} }
#endif
#endif #endif
while (i < n) { while (i < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {

View File

@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__VEC__) || defined(__ALTIVEC__)
/** /**
* Find minimum index * Find minimum index
* Warning: requirements n>0 and n % 32 == 0 * Warning: requirements n>0 and n % 32 == 0
@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
return index; return index;
} }
#endif
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
if (inc_x == 1) { if (inc_x == 1) {
#if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
min = diamin_kernel_32(n1, x, &minf); min = diamin_kernel_32(n1, x, &minf);
i = n1; i = n1;
} }
#endif
#endif #endif
while (i < n) { while (i < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {

View File

@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
/** /**
* Find maximum index * Find maximum index
@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
} }
#endif
@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (inc_x == 1) { if (inc_x == 1) {
#if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
i = n1; i = n1;
ix = n1 << 1; ix = n1 << 1;
} }
#endif
#endif #endif
while(i < n) while(i < n)

View File

@ -25,13 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#define ABS fabs #define ABS fabs
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) #define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
#if defined(__VEC__) || defined(__ALTIVEC__)
/** /**
* Find minimum index * Find minimum index
@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
return index; return index;
} }
#endif
@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
minf = CABS1(x,0); //index will not be incremented minf = CABS1(x,0); //index will not be incremented
#if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
i = n1; i = n1;
ix = n1 << 1; ix = n1 << 1;
} }
#endif
#endif #endif
while(i < n) while(i < n)
@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
} }
} }

View File

@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "sasum_microk_power8.c" #include "sasum_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -28,8 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#define offset_0 0
#define offset_1 16
#define offset_2 32
#define offset_3 48
#define offset_4 64
#define offset_5 80
#define offset_6 96
#define offset_7 112
#define offset_8 128
#define offset_9 144
#define offset_10 160
#define offset_11 176
#define offset_12 192
#define offset_13 208
#define offset_14 224
#define offset_15 240
#if defined(__VEC__) || defined(__ALTIVEC__)
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
#include <altivec.h> #include <altivec.h>
@ -37,12 +54,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{ {
BLASLONG i = 0; BLASLONG i = 0;
__vector float v_a = {alpha,alpha,alpha,alpha}; __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha};
__vector float * v_y=(__vector float *)y; __vector float * vptr_y =(__vector float *)y;
__vector float * v_x=(__vector float *)x; __vector float * vptr_x =(__vector float *)x;
for(; i<n/4; i+=16){ for(; i<n/4; i+=16){
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
register __vector float vy_8 = vec_vsx_ld( offset_8 ,vptr_y ) ;
register __vector float vy_9 = vec_vsx_ld( offset_9 ,vptr_y ) ;
register __vector float vy_10 = vec_vsx_ld( offset_10 ,vptr_y ) ;
register __vector float vy_11 = vec_vsx_ld( offset_11 ,vptr_y ) ;
register __vector float vy_12 = vec_vsx_ld( offset_12 ,vptr_y ) ;
register __vector float vy_13 = vec_vsx_ld( offset_13 ,vptr_y ) ;
register __vector float vy_14 = vec_vsx_ld( offset_14 ,vptr_y ) ;
register __vector float vy_15 = vec_vsx_ld( offset_15 ,vptr_y ) ;
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
register __vector float vx_8 = vec_vsx_ld( offset_8 ,vptr_x ) ;
register __vector float vx_9 = vec_vsx_ld( offset_9 ,vptr_x ) ;
register __vector float vx_10 = vec_vsx_ld( offset_10 ,vptr_x ) ;
register __vector float vx_11 = vec_vsx_ld( offset_11 ,vptr_x ) ;
register __vector float vx_12 = vec_vsx_ld( offset_12 ,vptr_x ) ;
register __vector float vx_13 = vec_vsx_ld( offset_13 ,vptr_x ) ;
register __vector float vx_14 = vec_vsx_ld( offset_14 ,vptr_x ) ;
register __vector float vx_15 = vec_vsx_ld( offset_15 ,vptr_x ) ;
vy_0 += vx_0*v_a;
vy_1 += vx_1*v_a;
vy_2 += vx_2*v_a;
vy_3 += vx_3*v_a;
vy_4 += vx_4*v_a;
vy_5 += vx_5*v_a;
vy_6 += vx_6*v_a;
vy_7 += vx_7*v_a;
vy_8 += vx_8*v_a;
vy_9 += vx_9*v_a;
vy_10 += vx_10*v_a;
vy_11 += vx_11*v_a;
vy_12 += vx_12*v_a;
vy_13 += vx_13*v_a;
vy_14 += vx_14*v_a;
vy_15 += vx_15*v_a;
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
vec_vsx_st( vy_8, offset_8 ,vptr_y ) ;
vec_vsx_st( vy_9, offset_9 ,vptr_y ) ;
vec_vsx_st( vy_10, offset_10 ,vptr_y ) ;
vec_vsx_st( vy_11, offset_11 ,vptr_y ) ;
vec_vsx_st( vy_12, offset_12 ,vptr_y ) ;
vec_vsx_st( vy_13, offset_13 ,vptr_y ) ;
vec_vsx_st( vy_14, offset_14 ,vptr_y ) ;
vec_vsx_st( vy_15, offset_15 ,vptr_y ) ;
vptr_x+=16;
vptr_y+=16;
/*
v_y[i] += v_a * v_x[i]; v_y[i] += v_a * v_x[i];
v_y[i+1] += v_a * v_x[i+1]; v_y[i+1] += v_a * v_x[i+1];
v_y[i+2] += v_a * v_x[i+2]; v_y[i+2] += v_a * v_x[i+2];
@ -59,9 +149,11 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
v_y[i+13] += v_a * v_x[i+13]; v_y[i+13] += v_a * v_x[i+13];
v_y[i+14] += v_a * v_x[i+14]; v_y[i+14] += v_a * v_x[i+14];
v_y[i+15] += v_a * v_x[i+15]; v_y[i+15] += v_a * v_x[i+15];
*/
} }
} }
#endif #endif
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
@ -74,11 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{ {
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
#if defined(__VEC__) || defined(__ALTIVEC__)
if ( n1 ) if ( n1 )
saxpy_kernel_64(n1, x, y, da); saxpy_kernel_64(n1, x, y, da);
i = n1; i = n1;
#endif
while(i < n) while(i < n)
{ {

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "scopy_microk_power8.c" #include "scopy_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "sdot_microk_power8.c" #include "sdot_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <altivec.h> #include <altivec.h>
typedef unsigned char vec_t __attribute__ ((vector_size (16))); typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#define SAVE_ACC(ACC, J) \ #define SAVE_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] = result[3] * alpha; \ rowC[0] = result[0] * alpha; \
rowC = (v4sf_t *) &CO[1*ldc+J]; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \
rowC[0] = result[2] * alpha; \ rowC[0] = result[1] * alpha; \
rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \
rowC[0] = result[1] * alpha; \ rowC[0] = result[2] * alpha; \
rowC = (v4sf_t *) &CO[3*ldc+J]; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] = result[0] * alpha; rowC[0] = result[3] * alpha;
#define SAVE_ACC1(ACC, J) \ #define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \ rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] = result[3] * alpha; \ rowC[0] = result[0] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] = result[2] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] = result[1] * alpha; \ rowC[0] = result[1] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] = result[2] * alpha; \
rowC = (v4sf_t *) &CO[7*ldc+J]; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \
rowC[0] = result[0] * alpha; rowC[0] = result[3] * alpha;
#define SAVE4x2_ACC(ACC, J) \ #define SAVE4x2_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v2sf_t *) &CO[0* ldc+J]; \ rowC = (v2sf_t *) &CO[0* ldc+J]; \
rowC[0] = result[6] * alpha; \ rowC[0] = result[0] * alpha; \
rowC = (v2sf_t *) &CO[1* ldc+J]; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \
rowC[0] = result[4] * alpha; \ rowC[0] = result[2] * alpha; \
rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \
rowC[0] = result[2] * alpha; \
rowC = (v2sf_t *) &CO[3* ldc+J]; \
rowC[0] = result[0] * alpha;
#define SAVE4x2_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (v2sf_t *) &CO[4* ldc+J]; \
rowC[0] = result[6] * alpha; \
rowC = (v2sf_t *) &CO[5* ldc+J]; \
rowC[0] = result[4] * alpha; \ rowC[0] = result[4] * alpha; \
rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \
rowC[0] = result[6] * alpha;
#define SAVE4x2_ACC1(ACC, J) \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v2sf_t *) &CO[4* ldc+J]; \
rowC[0] = result[0] * alpha; \
rowC = (v2sf_t *) &CO[5* ldc+J]; \
rowC[0] = result[2] * alpha; \ rowC[0] = result[2] * alpha; \
rowC = (v2sf_t *) &CO[6* ldc+J]; \
rowC[0] = result[4] * alpha; \
rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \
rowC[0] = result[0] * alpha; rowC[0] = result[6] * alpha;
#define SAVE2x4_ACC(ACC, J) \ #define SAVE2x4_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] = result[3] * alpha; \ rowC[0] = result[0] * alpha; \
rowC = (v4sf_t *) &CO[1* ldc+J]; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \
rowC[0] = result[2] * alpha; rowC[0] = result[1] * alpha;
#else #else
#define SAVE_ACC(ACC, J) \ #define SAVE_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[1*ldc+J]; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \
rowC[0] += result[2] * alpha; \ rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \
rowC[0] += result[1] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[3*ldc+J]; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[3] * alpha;
#define SAVE_ACC1(ACC, J) \ #define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \ rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] += result[1] * alpha; \ rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[7*ldc+J]; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[3] * alpha;
#define SAVE4x2_ACC(ACC, J) \ #define SAVE4x2_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v2sf_t *) &CO[0* ldc+J]; \ rowC = (v2sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[6] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v2sf_t *) &CO[1* ldc+J]; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \
rowC[0] += result[4] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v2sf_t *) &CO[3* ldc+J]; \
rowC[0] += result[0] * alpha;
#define SAVE4x2_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (v2sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[6] * alpha; \
rowC = (v2sf_t *) &CO[5* ldc+J]; \
rowC[0] += result[4] * alpha; \ rowC[0] += result[4] * alpha; \
rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \
rowC[0] += result[6] * alpha;
#define SAVE4x2_ACC1(ACC, J) \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v2sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[0] * alpha; \
rowC = (v2sf_t *) &CO[5* ldc+J]; \
rowC[0] += result[2] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v2sf_t *) &CO[6* ldc+J]; \
rowC[0] += result[4] * alpha; \
rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[6] * alpha;
#define SAVE2x4_ACC(ACC, J) \ #define SAVE2x4_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[1* ldc+J]; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \
rowC[0] += result[2] * alpha; rowC[0] += result[1] * alpha;
#endif #endif
#define KERNEL(i, j) \ #define KERNEL(i, j) \
__builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \

View File

@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_n.c"
#else
#include "common.h" #include "common.h"
@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return(0); return(0);
} }
#endif

View File

@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/gemv_t.c"
#else
#include "common.h" #include "common.h"
@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }
#endif

View File

@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16)
#define BF16TOF32(x) x #define BF16TOF32(x) x
#endif #endif
typedef unsigned char vec_t __attribute__ ((vector_size (16))); typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
@ -64,54 +64,54 @@ vector char mask =
#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y)
#define SAVE_ACC(ACC, J) \ #define SAVE_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[1*ldc+J]; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \
rowC[0] += result[2] * alpha; \ rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \
rowC[0] += result[1] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[3*ldc+J]; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[3] * alpha;
#define SAVE_ACC1(ACC, J) \ #define SAVE_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[4* ldc+J]; \ rowC = (v4sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] += result[1] * alpha; \ rowC[0] += result[1] * alpha; \
rowC = (v4sf_t *) &CO[6*ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v4sf_t *) &CO[7*ldc+J]; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[3] * alpha;
#define SAVE4x2_ACC(ACC, J) \ #define SAVE4x2_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v2sf_t *) &CO[0* ldc+J]; \ rowC = (v2sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[6] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v2sf_t *) &CO[1* ldc+J]; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \
rowC[0] += result[4] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \
rowC[0] += result[2] * alpha; \
rowC = (v2sf_t *) &CO[3* ldc+J]; \
rowC[0] += result[0] * alpha;
#define SAVE4x2_ACC1(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \
rowC = (v2sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[6] * alpha; \
rowC = (v2sf_t *) &CO[5* ldc+J]; \
rowC[0] += result[4] * alpha; \ rowC[0] += result[4] * alpha; \
rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \
rowC[0] += result[6] * alpha;
#define SAVE4x2_ACC1(ACC, J) \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v2sf_t *) &CO[4* ldc+J]; \
rowC[0] += result[0] * alpha; \
rowC = (v2sf_t *) &CO[5* ldc+J]; \
rowC[0] += result[2] * alpha; \ rowC[0] += result[2] * alpha; \
rowC = (v2sf_t *) &CO[6* ldc+J]; \
rowC[0] += result[4] * alpha; \
rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \
rowC[0] += result[0] * alpha; rowC[0] += result[6] * alpha;
#define MMA __builtin_mma_xvbf16ger2pp #define MMA __builtin_mma_xvbf16ger2pp
#define SAVE2x4_ACC(ACC, J) \ #define SAVE2x4_ACC(ACC, J) \
__builtin_mma_disassemble_acc (result, ACC); \ __builtin_mma_disassemble_acc ((void *)result, ACC); \
rowC = (v4sf_t *) &CO[0* ldc+J]; \ rowC = (v4sf_t *) &CO[0* ldc+J]; \
rowC[0] += result[3] * alpha; \ rowC[0] += result[0] * alpha; \
rowC = (v4sf_t *) &CO[1* ldc+J]; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \
rowC[0] += result[2] * alpha; rowC[0] += result[1] * alpha;
#define SET_ACC_ZERO4() \ #define SET_ACC_ZERO4() \
__builtin_mma_xxsetaccz (&acc0); \ __builtin_mma_xxsetaccz (&acc0); \

View File

@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1" #pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "srot_microk_power8.c" #include "srot_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "sscal_microk_power8.c" #include "sscal_microk_power8.c"
#endif #endif
#endif
#if !defined(HAVE_KERNEL_16) #if !defined(HAVE_KERNEL_16)

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "sswap_microk_power8.c" #include "sswap_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_32 #ifndef HAVE_KERNEL_32

View File

@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zasum_microk_power8.c" #include "zasum_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zaxpy_microk_power8.c" #include "zaxpy_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_4 #ifndef HAVE_KERNEL_4

View File

@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zcopy_microk_power8.c" #include "zcopy_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zdot_microk_power8.c" #include "zdot_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8
@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
if ( n <= 0 ) if ( n <= 0 )
{ { /*
__real__ result = 0.0 ; __real__ result = 0.0 ;
__imag__ result = 0.0 ; __imag__ result = 0.0 ;
*/
result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
return(result); return(result);
} }
@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in
} }
#if !defined(CONJ) #if !defined(CONJ)
/*
__real__ result = dot[0] - dot[1]; __real__ result = dot[0] - dot[1];
__imag__ result = dot[2] + dot[3]; __imag__ result = dot[2] + dot[3];
*/
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
#else #else
/*
__real__ result = dot[0] + dot[1]; __real__ result = dot[0] + dot[1];
__imag__ result = dot[2] - dot[3]; __imag__ result = dot[2] - dot[3];
*/
result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
#endif #endif

View File

@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdio.h> #include <stdio.h>
#include "common.h" #include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1 #define HAVE_KERNEL_4x1_VEC 1
@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <altivec.h> #include <altivec.h>
#endif #endif
#endif
// //
#define NBMAX 4096 #define NBMAX 4096

View File

@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#define NBMAX 4096 #define NBMAX 4096
#if defined(__VEC__) || defined(__ALTIVEC__)
#define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1 #define HAVE_KERNEL_4x1_VEC 1
#endif
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <altivec.h> #include <altivec.h>
#endif #endif

View File

@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if !defined(__VEC__) || !defined(__ALTIVEC__)
#include "../arm/zrot.c"
#else
#include "common.h" #include "common.h"
@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
#endif

View File

@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1" #pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(DOUBLE) #if defined(DOUBLE)
#include "zscal_microk_power8.c" #include "zscal_microk_power8.c"
#endif #endif
#endif #endif
#endif
#ifndef HAVE_KERNEL_8 #ifndef HAVE_KERNEL_8

View File

@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "zswap_microk_power8.c" #include "zswap_microk_power8.c"
#endif #endif
#endif
#ifndef HAVE_KERNEL_16 #ifndef HAVE_KERNEL_16

View File

@ -1,667 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"

/* Threaded blocked Cholesky (POTRF) driver.  Everything below, up to the
 * matching #endif, is only compiled for the mailbox-based parallel path;
 * with USE_SIMPLE_THREADED_LEVEL3 the update is delegated to the generic
 * threaded level-3 routines instead (see CNAME at the bottom of the file). */
#ifndef USE_SIMPLE_THREADED_LEVEL3

/* The array of job_t may overflow the stack.
   Instead, use malloc to alloc job_t. */
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif

/* -1.0 passed to TRSM_KERNEL as the scaling factor. */
static FLOAT dm1 = -1.;

/* Rank-k update kernel: upper- or lower-triangular SYRK variant. */
#ifndef KERNEL_FUNC
#ifndef LOWER
#define KERNEL_FUNC SYRK_KERNEL_U
#else
#define KERNEL_FUNC SYRK_KERNEL_L
#endif
#endif

/* Triangular-solve kernel used to form the panel below/right of the
 * diagonal block; the variant depends on storage side and conjugation. */
#ifndef LOWER
#ifndef COMPLEX
#define TRSM_KERNEL TRSM_KERNEL_LT
#else
#define TRSM_KERNEL TRSM_KERNEL_LC
#endif
#else
#ifndef COMPLEX
#define TRSM_KERNEL TRSM_KERNEL_RN
#else
#define TRSM_KERNEL TRSM_KERNEL_RR
#endif
#endif

/* Stride (in BLASLONG slots) between mailbox entries; spaces the entries
 * of job_t.working so each slot sits on its own cache line. */
#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif

/* Number of sub-slices each thread's stripe is divided into (and hence
 * the number of staging buffers / mailbox slots per thread pair). */
#ifndef DIVIDE_RATE
#define DIVIDE_RATE 2
#endif

#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

/* Upper-triangular factorization works on the transposed layout. */
#ifndef LOWER
#define TRANS
#endif

#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
#elif !defined(LOWER) && defined(TRANS)
#define SYRK_LOCAL SYRK_UT
#elif defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_LN
#else
#define SYRK_LOCAL SYRK_LT
#endif
#endif

/* Inter-thread mailbox: working[i][CACHE_LINE_SIZE * b] is written by the
 * panel owner with the address of packed buffer b intended for thread i
 * (non-zero = ready), and cleared by the consumer when done.  _Atomic /
 * volatile keeps the compiler from caching the spin-wait loads. */
typedef struct {
#ifdef HAVE_C11
  _Atomic
#else
  volatile
#endif
  BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

/* Rank-k update of the C tile at (X, Y); complex builds pass both the real
 * and imaginary parts of alpha. */
#ifndef KERNEL_OPERATION
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y))
#endif
#endif

/* Pack an M x N sub-panel of A starting at (X, Y) into BUFFER; the copy
 * direction (transposed or not) follows the TRANS setting above. */
#ifndef ICOPY_OPERATION
#ifndef TRANS
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

#ifndef OCOPY_OPERATION
#ifdef TRANS
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER);
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER);
#endif
#endif

/* Convenience aliases onto the blas_arg_t fields as used by inner_thread:
 * S = diagonal block, A = trailing panel, C = trailing matrix, K = block
 * size of the current step.  Note N maps to args->m (rows remaining). */
#ifndef S
#define S	args -> a
#endif
#ifndef A
#define A	args -> b
#endif
#ifndef C
#define C	args -> c
#endif
#ifndef LDA
#define LDA	args -> lda
#endif
#ifndef N
#define N	args -> m
#endif
#ifndef K
#define K	args -> k
#endif
/* Per-thread worker for the trailing-matrix update of one Cholesky step.
 *
 * Each thread owns the stripe [range_n[mypos], range_n[mypos+1]) of the
 * trailing matrix.  It (1) TRSM-solves its own panel slices against the
 * shared diagonal block (packed into sb), (2) publishes each packed slice
 * to the other threads through the job[] mailbox, then (3) walks the
 * mailboxes of all relevant threads, spin-waiting until each slice is
 * ready, and applies the rank-k KERNEL_OPERATION update to its stripe of C.
 *
 * args     -> k: block size; b: trailing panel; c: trailing matrix;
 *             a (via S): diagonal block; common: job_t mailbox array.
 * range_m     unused here (kept for the blas_queue routine signature).
 * range_n     per-thread stripe boundaries, num_cpu + 1 entries.
 * sa, sb      thread-local / shared packing buffers.
 * mypos       this thread's index.
 * Returns 0.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];    /* staging areas for this thread's packed panel slices */

  BLASLONG k, lda;
  BLASLONG m_from, m_to;

  FLOAT *alpha;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;   /* shared inter-thread mailbox */
  BLASLONG xxx, bufferside;

  BLASLONG jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;

  alpha = (FLOAT *)args -> alpha;

  /* This thread's stripe of the trailing matrix. */
  m_from = range_n[mypos + 0];
  m_to   = range_n[mypos + 1];

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld\n",  mypos, m_from, m_to);
#endif

  /* Slice width per mailbox buffer, rounded up to the unroll factor. */
  div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

  /* Carve the staging buffers out of sb, past the k*k packed diagonal
   * block, aligned to GEMM_ALIGN and offset by GEMM_OFFSET_B. */
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }

  /* Pack the (already factored) diagonal block S into sb; every thread
   * repacks it locally rather than sharing one copy. */
#ifndef LOWER
  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif

  /* Phase 1: solve this thread's panel slices in place (A <- A * T^-1,
   * scaled by dm1) and publish each packed slice through the mailbox. */
  for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {

    for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

      min_jj = MIN(m_to, xxx + div_n) - jjs;

#ifndef LOWER
      if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
      if (min_jj > GEMM_P) min_jj = GEMM_P;
#endif

#ifndef LOWER
      /* Upper: pack columns, solve against sb, write result back to A. */
      OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL (k, min_jj, k, dm1,
#ifdef COMPLEX
		   ZERO,
#endif
		   sb,
		   buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
		   a + jjs * lda * COMPSIZE, lda, 0);
#else
      /* Lower: pack rows, solve from the right-hand side. */
      ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL (min_jj, k, k, dm1,
#ifdef COMPLEX
		   ZERO,
#endif
		   buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
		   sb,
		   a + jjs * COMPSIZE, lda, 0);
#endif
    }

    /* Publish the slice: upper updates touch threads <= mypos, lower
     * updates touch threads >= mypos.  A non-zero value is the buffer
     * address; consumers spin on it below. */
#ifndef LOWER
    for (i = 0; i <= mypos; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
    for (i = mypos; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif

    WMB;   /* make the published buffer visible before consumers read the flag
              (WMB: write barrier macro defined in common headers) */
  }

  /* Phase 2: consume published slices and update this thread's stripe of
   * C, one GEMM_P-sized row block at a time.  First block: */
  min_i = m_to - m_from;

  if (min_i >= GEMM_P * 2) {
    min_i = GEMM_P;
  } else
    if (min_i > GEMM_P) {
      min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    }

  /* Pack this row block of the panel into the thread-local sa. */
#ifndef LOWER
  ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
  OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif

  /* Walk the producing threads: upward (mypos..nthreads-1) for upper,
   * downward (mypos..0) for lower, matching the publish pattern above. */
  current = mypos;

#ifndef LOWER
  while (current < args -> nthreads)
#else
  while (current >= 0)
#endif
    {
      div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

	/* thread has to wait */
	if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

	KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), k, alpha,
			 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			 c, lda, m_from, xxx);

	/* Last row block of this stripe: release the mailbox slot so the
	 * producer's final drain loop can finish. */
	if (m_from + min_i >= m_to) {
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  WMB;
	}
      }

#ifndef LOWER
      current ++;
#else
      current --;
#endif
    }

  /* Remaining row blocks of this thread's stripe: slices are already
   * known to be published (waited for above), so no spin-wait here. */
  for(is = m_from + min_i; is < m_to; is += min_i){

    min_i = m_to - is;

    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else
      if (min_i > GEMM_P) {
	min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
      }

#ifndef LOWER
    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
    OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif

    current = mypos;

#ifndef LOWER
    while (current < args -> nthreads)
#else
    while (current >= 0)
#endif
      {
	div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, lda, is, xxx);

	  if (is + min_i >= m_to) {
	    /* Last block: release the slot. */
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	    WMB;
	  }
	}

#ifndef LOWER
	current ++;
#else
	current --;
#endif
      }
  }

  /* Barrier: wait until every consumer has cleared the slots this thread
   * published, so sb's staging buffers can be safely reused. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
	while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  return 0;
}
/* Partition one trailing-matrix update across the available threads and
 * run inner_thread on each via exec_blas.
 *
 * Splits the n rows/columns into per-thread stripes whose widths follow a
 * sqrt-based schedule (so every stripe represents roughly equal triangular
 * work), rounded to the GEMM unroll mask, zeroes the shared job mailbox,
 * and launches the workers.  For the upper (non-LOWER) case the stripe
 * boundaries are built from the top of range[] downwards so that thread 0
 * gets the widest stripe.  Returns 0.
 */
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){

  blas_arg_t newarg;

  /* Mailbox array: on the stack unless MAX_CPU_NUMBER makes it too big
   * (USE_ALLOC_HEAP, decided at the top of the file). */
#ifndef USE_ALLOC_HEAP
  job_t          job[MAX_CPU_NUMBER];
#else
  job_t *        job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

  /* Pick the queue mode flag and the stripe-alignment mask (GEMM unroll
   * minus one) for the active precision. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#elif defined(HALF)
  mode  =  BLAS_HALF | BLAS_REAL;
  mask  = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif
#endif

  newarg.m = args -> m;
  newarg.k = args -> k;
  newarg.a = args -> a;
  newarg.b = args -> b;
  newarg.c = args -> c;
  newarg.lda = args -> lda;
  newarg.alpha = args -> alpha;

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  newarg.common = (void *)job;   /* shared mailbox, read by inner_thread */

  n_from = 0;
  n_to   = args -> m;

#ifndef LOWER

  /* Upper case: build stripe boundaries from range[MAX_CPU_NUMBER] down,
   * so the array tail holds [0 .. n] split into num_cpu stripes. */
  range[MAX_CPU_NUMBER] = n_to - n_from;
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;   /* target work per thread */

  while (i < n){

    if (nthreads - num_cpu > 1) {

      double di   = (double)i;

      /* Width so that the stripe's triangular area approx equals dnum,
       * rounded up to the unroll granularity (mask + 1). */
      width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));

      if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1));

      if ((width > n - i) || (width < mask)) width = n - i;

    } else {
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

  /* All workers share the same boundary array (tail of range[]). */
  for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  /* Lower case: boundaries grow upward from range[0]. */
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){

    if (nthreads - num_cpu > 1) {

      double di   = (double)i;

      width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));

      if ((width > n - i) || (width < mask)) width = n - i;

    } else {
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#endif

  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* Clear every mailbox slot before the workers start spinning on them. */
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }

    /* Thread 0 reuses the caller's packing buffers; the rest are assigned
     * by exec_blas. */
    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
#endif
/* Threaded blocked Cholesky factorization (POTRF) entry point.
 *
 * Factors the n x n matrix at args->a in place, LAPACK-style: for each
 * diagonal block of size bk it (1) recursively factors the block, then
 * (2) updates the trailing matrix — via the mailbox-based thread_driver,
 * or, with USE_SIMPLE_THREADED_LEVEL3, via generic threaded TRSM + HERK.
 * Small problems (1 thread, or n <= 2 * GEMM_UNROLL_N) fall through to the
 * single-threaded POTRF_U/L_SINGLE.
 *
 * Returns 0 on success, or i + info (1-based global index of the first
 * non-positive-definite pivot) on failure, matching LAPACK xPOTRF.
 * range_m is unused; range_n, when given, narrows n to range_n[1]-range_n[0].
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG n, bk, i, blocking, lda;
  BLASLONG info;
  int mode;
  blas_arg_t newarg;
  FLOAT *a;
  FLOAT alpha[2] = { -ONE, ZERO};   /* alpha = -1 for the rank-k update */

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif

  /* Single-thread fast path. */
  if (args -> nthreads  == 1) {
#ifndef LOWER
    info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0);
#else
    info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
#endif
    return info;
  }

  n  = args -> n;
  a  = (FLOAT *)args -> a;
  lda = args -> lda;

  if (range_n) n  = range_n[1] - range_n[0];

  /* Too small to be worth threading. */
  if (n <= GEMM_UNROLL_N * 2) {
#ifndef LOWER
    info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0);
#else
    info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
#endif
    return info;
  }

  newarg.lda = lda;
  newarg.ldb = lda;
  newarg.ldc = lda;
  newarg.alpha = alpha;
  newarg.beta = NULL;
  newarg.nthreads = args -> nthreads;

  /* Block size: about half of n, rounded to GEMM_UNROLL_N, capped at GEMM_Q. */
  blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N;
  if (blocking > GEMM_Q) blocking = GEMM_Q;

  for (i = 0; i < n; i += blocking) {

    bk = n - i;
    if (bk > blocking) bk = blocking;

    /* (1) Factor the bk x bk diagonal block (recursive call). */
    newarg.m = bk;
    newarg.n = bk;
    newarg.a = a + (i + i * lda) * COMPSIZE;

    info = CNAME(&newarg, NULL, NULL, sa, sb, 0);
    if (info) return info + i;   /* translate block-local pivot index to global */

    if (n - i - bk > 0) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
      /* (2a) Fused TRSM + rank-k update of the trailing matrix via the
       * mailbox workers; b is the off-diagonal panel, c the trailing block. */
      newarg.m = n - i - bk;
      newarg.k = bk;
#ifndef LOWER
      newarg.b = a + ( i          + (i + bk) * lda) * COMPSIZE;
#else
      newarg.b = a + ((i + bk) +  i         * lda) * COMPSIZE;
#endif
      newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;

      thread_driver(&newarg, sa, sb);
#else
      /* (2b) Fallback: separate threaded triangular solve then HERK. */
#ifndef LOWER
      newarg.m = bk;
      newarg.n = n - i - bk;
      newarg.a = a + (i +  i       * lda) * COMPSIZE;
      newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;

      gemm_thread_n(mode | BLAS_TRANSA_T,
		    &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads);

      newarg.n = n - i - bk;
      newarg.k = bk;
      newarg.a = a + ( i       + (i + bk) * lda) * COMPSIZE;
      newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE;

#if 0
      HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
#else
      syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
		  &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads);
#endif
#else
      newarg.m = n - i - bk;
      newarg.n = bk;
      newarg.a = a + (i      +  i * lda) * COMPSIZE;
      newarg.b = a + (i + bk +  i * lda) * COMPSIZE;

      gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
		    &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);

      newarg.n = n - i - bk;
      newarg.k = bk;
      newarg.a = a + (i + bk +  i       * lda) * COMPSIZE;
      newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE;

#if 0
      HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
#else
      syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
		  &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
#endif
#endif
#endif
    }
  }

  return 0;
}

View File

@ -101,7 +101,12 @@ static FLOAT dm1 = -1.;
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #ifdef HAVE_C11
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;
@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
#elif defined(DOUBLE) #elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL; mode = BLAS_DOUBLE | BLAS_REAL;
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#elif defined(HALF)
mode = BLAS_HALF | BLAS_REAL;
mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
#else #else
mode = BLAS_SINGLE | BLAS_REAL; mode = BLAS_SINGLE | BLAS_REAL;
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;

43
param.h
View File

@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define GEMM_DEFAULT_ALIGN 0x0ffffUL
#if defined(__32BIT__)
#warning using BINARY32==POWER6
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 4
#else
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 16
@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2
#endif
#define SGEMM_DEFAULT_P 1280UL #define SGEMM_DEFAULT_P 1280UL
#define DGEMM_DEFAULT_P 640UL #define DGEMM_DEFAULT_P 640UL
#define CGEMM_DEFAULT_P 640UL #define CGEMM_DEFAULT_P 640UL
@ -2769,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096
#elif defined(THUNDERX3T110)
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 320
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(NEOVERSEN1) #elif defined(NEOVERSEN1)
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16