diff --git a/Makefile.arm64 b/Makefile.arm64 index a7cd82e3a..1091edfe5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(CORE), THUNDERX3T110) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +else +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +endif +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.system b/Makefile.system index db651ef99..d7e71d00a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 +DYNAMIC_CORE += THUNDERX3T110 endif ifeq ($(ARCH), zarch) diff --git a/TargetList.txt b/TargetList.txt index 4e54e3077..8ea2df9b7 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -96,6 +96,7 @@ FALKOR THUNDERX THUNDERX2T99 TSV110 +THUNDERX3T110 9.System Z: ZARCH_GENERIC diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d56ba99cb..5388156bc 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) endif () if (POWER) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 30256870c..e50483a2f 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -338,6 +338,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX3T110") + file(APPEND ${TARGET_CONF_TEMP} + "#define THUNDERX3T110\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t94371840\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "TSV110") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 4103216e6..6f41be604 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -40,6 +40,7 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +#define CPU_THUNDERX3T110 12 //Hisilicon #define CPU_TSV110 9 // Ampere @@ -57,7 +58,8 @@ static char *cpuname[] = { "THUNDERX2T99", "TSV110", "EMAG8180", - "NEOVERSEN1" + "NEOVERSEN1", + "THUNDERX3T110" }; static char *cpuname_lower[] = { @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { "thunderx2t99", "tsv110", "emag8180", - "neoversen1" + "neoversen1", + "thunderx3t110" }; int get_feature(char *search) @@ -158,6 +161,8 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8")) + return CPU_THUNDERX3T110; // HiSilicon else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) return CPU_TSV110; @@ -372,7 +377,25 @@ void get_cpuconfig(void) printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); + break; + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } get_cpucount(); } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 11ef2725c..157b03365 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +extern gotoblas_t gotoblas_THUNDERX3T110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 11 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -82,6 +83,7 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "thunderx3t110", "unknown" }; @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; return corename[NUM_CORETYPES]; } @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); + case 11: return (&gotoblas_THUNDERX3T110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX; case 0x0af: // ThunderX2 return &gotoblas_THUNDERX2T99; + case 0x0b8: // ThunderX3 + return &gotoblas_THUNDERX3T110; } break; case 0x48: // HiSilicon diff --git a/getarch.c b/getarch.c index 2cdf77259..51c9a84e5 100644 --- a/getarch.c +++ b/getarch.c @@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_THUNDERX3T110 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "THUNDERX3T110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTHUNDERX3T110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "thunderx3t110" +#define CORENAME "THUNDERX3T110" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/interface/swap.c b/interface/swap.c index 17a9868a9..ea40b1fc2 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/interface/zswap.c b/interface/zswap.c index 372b15447..43971b73e 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 new file mode 100644 index 000000000..a20d0d4a6 --- /dev/null +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -0,0 +1,184 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif diff --git a/param.h b/param.h index efe0e1096..476f237a1 100644 --- a/param.h +++ b/param.h @@ -2779,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX3T110) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(NEOVERSEN1) #define SGEMM_DEFAULT_UNROLL_M 16