From 19c8f615dc507e20aee724aedb572bdddc2cd497 Mon Sep 17 00:00:00 2001 From: Sunita Nadampalli Date: Fri, 7 Jan 2022 00:28:17 +0000 Subject: [PATCH] OpenBLAS: aarch64: Add neoverse-v1/n2 architecture specifics --- Makefile.arm64 | 60 +++++++++++ Makefile.system | 3 + TargetList.txt | 2 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 62 ++++++++++- cpuid_arm64.c | 44 +++++++- driver/others/dynamic_arm64.c | 2 + getarch.c | 37 ++++++- kernel/arm64/KERNEL.NEOVERSEN2 | 189 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.NEOVERSEV1 | 189 +++++++++++++++++++++++++++++++++ param.h | 58 ++++++++++ 11 files changed, 641 insertions(+), 7 deletions(-) create mode 100644 kernel/arm64/KERNEL.NEOVERSEN2 create mode 100644 kernel/arm64/KERNEL.NEOVERSEV1 diff --git a/Makefile.arm64 b/Makefile.arm64 index 801601030..2eade8d78 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -78,6 +78,66 @@ endif endif endif +# Use a72 tunings because Neoverse-V1 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEV1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +endif +else +CCOMMON_OPT += -march=armv8.4-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-N2 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEN2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +endif +else +CCOMMON_OPT += -march=armv8.5-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..9203f49cb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -374,6 +374,7 @@ else endif GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += NEOVERSEV1 +DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX diff --git a/TargetList.txt b/TargetList.txt index b02a011d5..97c8a8f06 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -93,6 +93,8 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +NEOVERSEV1 +NEOVERSEN2 CORTEXA55 EMAG8180 FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d468eb60b..f4a135e82 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..5f12bb145 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -243,11 +243,11 @@ endif () "#define L1_CODE_ASSOCIATIVE\t4\n" "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t64\n" - "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" "#define L2_SIZE\t1048576\n\n" "#define L2_LINESIZE\t64\n" - "#define L2_ASSOCIATIVE\t16\n" - "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" "#define DTB_SIZE\t4096\n" "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" @@ -263,6 +263,62 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEV1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 958e94abc..cc3a82815 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -43,6 +43,8 @@ size_t length64=sizeof(value64); #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 #define CPU_NEOVERSEN1 11 +#define CPU_NEOVERSEV1 16 +#define CPU_NEOVERSEN2 17 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -71,6 +73,8 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", + "NEOVERSEV1" + "NEOVERSEN2" "THUNDERX3T110", "VORTEX", "CORTEXA55", @@ -90,6 +94,8 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", "vortex", "cortexa55", @@ -170,6 +176,10 @@ int detect(void) return CPU_CORTEXA73; else if (strstr(cpu_part, "0xd0c")) return CPU_NEOVERSEN1; + else if (strstr(cpu_part, "0xd40")) + return CPU_NEOVERSEV1; + else if (strstr(cpu_part, "0xd49")) + return CPU_NEOVERSEN2; else if (strstr(cpu_part, "0xd05")) return CPU_CORTEXA55; } @@ -338,11 +348,41 @@ void get_cpuconfig(void) printf("#define L1_DATA_ASSOCIATIVE 4\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_NEOVERSEV1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEN2: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_FALKOR: printf("#define FALKOR\n"); printf("#define L1_CODE_SIZE 65536\n"); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 04ceaaf6d..45ea9f113 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -147,6 +147,8 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", "cortexa55", "unknown" diff --git a/getarch.c b/getarch.c index 6063a2a1d..73bbf1892 100644 --- a/getarch.c +++ b/getarch.c @@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ - "-march=armv8.2-a -mtune=cortex-a72" + "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" #else #endif +#ifdef FORCE_NEOVERSEV1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.4-a -mtune=neoverse-v1" +#define LIBNAME "neoversev1" +#define CORENAME "NEOVERSEV1" +#else +#endif + + +#ifdef FORCE_NEOVERSEN2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.5-a -mtune=neoverse-n2" +#define LIBNAME "neoversen2" +#define CORENAME "NEOVERSEN2" +#else +#endif + #ifdef FORCE_CORTEXA55 #define FORCE #define ARCHITECTURE "ARM64" diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/param.h b/param.h index 8dd2a7461..eb4dcb8f0 100644 --- a/param.h +++ b/param.h @@ -3307,6 +3307,64 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(NEOVERSEV1) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(NEOVERSEN2) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(ARMV8SVE) || defined(A64FX) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".