Merge pull request #3508 from snadampal/v1_n2
OpenBLAS: aarch64: Add neoverse-v1/n2 architecture specifics
This commit is contained in:
commit
b6b024232d
|
@ -78,6 +78,66 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Fall back to a72 tunings on older compilers because -mtune=neoverse-v1
|
||||||
|
# is only available in GCC>=9.4
|
||||||
|
ifeq ($(CORE), NEOVERSEV1)
|
||||||
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
|
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||||
|
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||||
|
CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Fall back to a72 tunings on older compilers because -mtune=neoverse-n2
|
||||||
|
# is only available in GCC>=9.4
|
||||||
|
ifeq ($(CORE), NEOVERSEN2)
|
||||||
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
|
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||||
|
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||||
|
CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||||
ifeq ($(CORE), CORTEXA55)
|
ifeq ($(CORE), CORTEXA55)
|
||||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
|
|
|
@ -374,6 +374,7 @@ else
|
||||||
endif
|
endif
|
||||||
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
|
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
|
||||||
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
|
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
|
||||||
|
GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
|
||||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57
|
||||||
DYNAMIC_CORE += CORTEXA72
|
DYNAMIC_CORE += CORTEXA72
|
||||||
DYNAMIC_CORE += CORTEXA73
|
DYNAMIC_CORE += CORTEXA73
|
||||||
DYNAMIC_CORE += NEOVERSEN1
|
DYNAMIC_CORE += NEOVERSEN1
|
||||||
|
DYNAMIC_CORE += NEOVERSEV1
|
||||||
|
DYNAMIC_CORE += NEOVERSEN2
|
||||||
DYNAMIC_CORE += CORTEXA55
|
DYNAMIC_CORE += CORTEXA55
|
||||||
DYNAMIC_CORE += FALKOR
|
DYNAMIC_CORE += FALKOR
|
||||||
DYNAMIC_CORE += THUNDERX
|
DYNAMIC_CORE += THUNDERX
|
||||||
|
|
|
@ -93,6 +93,8 @@ CORTEXA57
|
||||||
CORTEXA72
|
CORTEXA72
|
||||||
CORTEXA73
|
CORTEXA73
|
||||||
NEOVERSEN1
|
NEOVERSEN1
|
||||||
|
NEOVERSEV1
|
||||||
|
NEOVERSEN2
|
||||||
CORTEXA55
|
CORTEXA55
|
||||||
EMAG8180
|
EMAG8180
|
||||||
FALKOR
|
FALKOR
|
||||||
|
|
|
@ -44,7 +44,7 @@ endif ()
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
if (ARM64)
|
if (ARM64)
|
||||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110)
|
||||||
if (DYNAMIC_LIST)
|
if (DYNAMIC_LIST)
|
||||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -247,11 +247,11 @@ endif ()
|
||||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||||
"#define L1_DATA_SIZE\t65536\n"
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
"#define L1_DATA_LINESIZE\t64\n"
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||||
"#define L2_SIZE\t1048576\n\n"
|
"#define L2_SIZE\t1048576\n\n"
|
||||||
"#define L2_LINESIZE\t64\n"
|
"#define L2_LINESIZE\t64\n"
|
||||||
"#define L2_ASSOCIATIVE\t16\n"
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||||
"#define DTB_SIZE\t4096\n"
|
"#define DTB_SIZE\t4096\n"
|
||||||
"#define HAVE_VFPV4\n"
|
"#define HAVE_VFPV4\n"
|
||||||
"#define HAVE_VFPV3\n"
|
"#define HAVE_VFPV3\n"
|
||||||
|
@ -267,6 +267,62 @@ endif ()
|
||||||
set(ZGEMM_UNROLL_M 4)
|
set(ZGEMM_UNROLL_M 4)
|
||||||
set(ZGEMM_UNROLL_N 4)
|
set(ZGEMM_UNROLL_N 4)
|
||||||
set(SYMV_P 16)
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "NEOVERSEV1")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L2_SIZE\t1048576\n\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define HAVE_VFPV4\n"
|
||||||
|
"#define HAVE_VFPV3\n"
|
||||||
|
"#define HAVE_VFP\n"
|
||||||
|
"#define HAVE_NEON\n"
|
||||||
|
"#define HAVE_SVE\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
|
||||||
|
# Append the cache/TLB geometry and feature macros for the NEOVERSEN2 target
# to the generated target configuration file.
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t4\n" # was 2; the N2 case of get_cpuconfig() and the FORCE_NEOVERSEN2 config both use 4
|
||||||
|
"#define L2_SIZE\t1048576\n\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define HAVE_VFPV4\n"
|
||||||
|
"#define HAVE_VFPV3\n"
|
||||||
|
"#define HAVE_VFP\n"
|
||||||
|
"#define HAVE_NEON\n"
|
||||||
|
"#define HAVE_SVE\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
elseif ("${TCORE}" STREQUAL "FALKOR")
|
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define L1_CODE_SIZE\t65536\n"
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
|
|
@ -43,6 +43,8 @@ size_t length64=sizeof(value64);
|
||||||
#define CPU_CORTEXA72 4
|
#define CPU_CORTEXA72 4
|
||||||
#define CPU_CORTEXA73 5
|
#define CPU_CORTEXA73 5
|
||||||
#define CPU_NEOVERSEN1 11
|
#define CPU_NEOVERSEN1 11
|
||||||
|
#define CPU_NEOVERSEV1 16
|
||||||
|
#define CPU_NEOVERSEN2 17
|
||||||
// Qualcomm
|
// Qualcomm
|
||||||
#define CPU_FALKOR 6
|
#define CPU_FALKOR 6
|
||||||
// Cavium
|
// Cavium
|
||||||
|
@ -71,6 +73,8 @@ static char *cpuname[] = {
|
||||||
"TSV110",
|
"TSV110",
|
||||||
"EMAG8180",
|
"EMAG8180",
|
||||||
"NEOVERSEN1",
|
"NEOVERSEN1",
|
||||||
|
"NEOVERSEV1", /* trailing comma required: adjacent string literals are otherwise concatenated into one array element */
|
||||||
|
"NEOVERSEN2", /* NOTE(review): CPU_NEOVERSEV1/N2 are 16/17 but these entries directly follow NEOVERSEN1 (11) - confirm cpuname[] order matches the CPU_* ids */
|
||||||
"THUNDERX3T110",
|
"THUNDERX3T110",
|
||||||
"VORTEX",
|
"VORTEX",
|
||||||
"CORTEXA55",
|
"CORTEXA55",
|
||||||
|
@ -90,6 +94,8 @@ static char *cpuname_lower[] = {
|
||||||
"tsv110",
|
"tsv110",
|
||||||
"emag8180",
|
"emag8180",
|
||||||
"neoversen1",
|
"neoversen1",
|
||||||
|
"neoversev1",
|
||||||
|
"neoversen2",
|
||||||
"thunderx3t110",
|
"thunderx3t110",
|
||||||
"vortex",
|
"vortex",
|
||||||
"cortexa55",
|
"cortexa55",
|
||||||
|
@ -170,6 +176,10 @@ int detect(void)
|
||||||
return CPU_CORTEXA73;
|
return CPU_CORTEXA73;
|
||||||
else if (strstr(cpu_part, "0xd0c"))
|
else if (strstr(cpu_part, "0xd0c"))
|
||||||
return CPU_NEOVERSEN1;
|
return CPU_NEOVERSEN1;
|
||||||
|
else if (strstr(cpu_part, "0xd40"))
|
||||||
|
return CPU_NEOVERSEV1;
|
||||||
|
else if (strstr(cpu_part, "0xd49"))
|
||||||
|
return CPU_NEOVERSEN2;
|
||||||
else if (strstr(cpu_part, "0xd05"))
|
else if (strstr(cpu_part, "0xd05"))
|
||||||
return CPU_CORTEXA55;
|
return CPU_CORTEXA55;
|
||||||
}
|
}
|
||||||
|
@ -338,11 +348,41 @@ void get_cpuconfig(void)
|
||||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||||
printf("#define L2_SIZE 1048576\n");
|
printf("#define L2_SIZE 1048576\n");
|
||||||
printf("#define L2_LINESIZE 64\n");
|
printf("#define L2_LINESIZE 64\n");
|
||||||
printf("#define L2_ASSOCIATIVE 16\n");
|
printf("#define L2_ASSOCIATIVE 8\n");
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||||
printf("#define DTB_SIZE 4096\n");
|
printf("#define DTB_SIZE 4096\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case CPU_NEOVERSEV1:
|
||||||
|
printf("#define %s\n", cpuname[d]);
|
||||||
|
printf("#define L1_CODE_SIZE 65536\n");
|
||||||
|
printf("#define L1_CODE_LINESIZE 64\n");
|
||||||
|
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||||
|
printf("#define L1_DATA_SIZE 65536\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 64\n");
|
||||||
|
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||||
|
printf("#define L2_SIZE 1048576\n");
|
||||||
|
printf("#define L2_LINESIZE 64\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 8\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CPU_NEOVERSEN2:
|
||||||
|
printf("#define %s\n", cpuname[d]);
|
||||||
|
printf("#define L1_CODE_SIZE 65536\n");
|
||||||
|
printf("#define L1_CODE_LINESIZE 64\n");
|
||||||
|
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||||
|
printf("#define L1_DATA_SIZE 65536\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 64\n");
|
||||||
|
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||||
|
printf("#define L2_SIZE 1048576\n");
|
||||||
|
printf("#define L2_LINESIZE 64\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 8\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
break;
|
||||||
|
|
||||||
case CPU_FALKOR:
|
case CPU_FALKOR:
|
||||||
printf("#define FALKOR\n");
|
printf("#define FALKOR\n");
|
||||||
printf("#define L1_CODE_SIZE 65536\n");
|
printf("#define L1_CODE_SIZE 65536\n");
|
||||||
|
|
|
@ -147,6 +147,8 @@ static char *corename[] = {
|
||||||
"tsv110",
|
"tsv110",
|
||||||
"emag8180",
|
"emag8180",
|
||||||
"neoversen1",
|
"neoversen1",
|
||||||
|
"neoversev1",
|
||||||
|
"neoversen2",
|
||||||
"thunderx3t110",
|
"thunderx3t110",
|
||||||
"cortexa55",
|
"cortexa55",
|
||||||
"unknown"
|
"unknown"
|
||||||
|
|
37
getarch.c
37
getarch.c
|
@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \
|
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \
|
||||||
"-march=armv8.2-a -mtune=cortex-a72"
|
"-march=armv8.2-a -mtune=neoverse-n1"
|
||||||
#define LIBNAME "neoversen1"
|
#define LIBNAME "neoversen1"
|
||||||
#define CORENAME "NEOVERSEN1"
|
#define CORENAME "NEOVERSEN1"
|
||||||
#else
|
#else
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Forced-target configuration for NEOVERSEV1. L2 associativity and DTB entry
 * count match the values printed by the CPU_NEOVERSEV1 case of get_cpuconfig()
 * and written by the CMake NEOVERSEV1 branch (8 and 48). */
#ifdef FORCE_NEOVERSEV1
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "ARM64"
|
||||||
|
#define SUBARCHITECTURE "NEOVERSEV1"
|
||||||
|
#define SUBDIRNAME "arm64"
|
||||||
|
#define ARCHCONFIG "-DNEOVERSEV1 " \
|
||||||
|
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||||
|
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||||
|
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
|
||||||
|
"-march=armv8.4-a -mtune=neoverse-v1"
|
||||||
|
#define LIBNAME "neoversev1"
|
||||||
|
#define CORENAME "NEOVERSEV1"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* Forced-target configuration for NEOVERSEN2. L2 associativity and DTB entry
 * count match the values printed by the CPU_NEOVERSEN2 case of get_cpuconfig()
 * and written by the CMake NEOVERSEN2 branch (8 and 48). */
#ifdef FORCE_NEOVERSEN2
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "ARM64"
|
||||||
|
#define SUBARCHITECTURE "NEOVERSEN2"
|
||||||
|
#define SUBDIRNAME "arm64"
|
||||||
|
#define ARCHCONFIG "-DNEOVERSEN2 " \
|
||||||
|
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||||
|
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||||
|
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
|
||||||
|
"-march=armv8.5-a -mtune=neoverse-n2"
|
||||||
|
#define LIBNAME "neoversen2"
|
||||||
|
#define CORENAME "NEOVERSEN2"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_CORTEXA55
|
#ifdef FORCE_CORTEXA55
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define ARCHITECTURE "ARM64"
|
#define ARCHITECTURE "ARM64"
|
||||||
|
|
|
@ -0,0 +1,189 @@
|
||||||
|
SAMINKERNEL = ../arm/amin.c
|
||||||
|
DAMINKERNEL = ../arm/amin.c
|
||||||
|
CAMINKERNEL = ../arm/zamin.c
|
||||||
|
ZAMINKERNEL = ../arm/zamin.c
|
||||||
|
|
||||||
|
SMAXKERNEL = ../arm/max.c
|
||||||
|
DMAXKERNEL = ../arm/max.c
|
||||||
|
|
||||||
|
SMINKERNEL = ../arm/min.c
|
||||||
|
DMINKERNEL = ../arm/min.c
|
||||||
|
|
||||||
|
ISAMINKERNEL = ../arm/iamin.c
|
||||||
|
IDAMINKERNEL = ../arm/iamin.c
|
||||||
|
ICAMINKERNEL = ../arm/izamin.c
|
||||||
|
IZAMINKERNEL = ../arm/izamin.c
|
||||||
|
|
||||||
|
ISMAXKERNEL = ../arm/imax.c
|
||||||
|
IDMAXKERNEL = ../arm/imax.c
|
||||||
|
|
||||||
|
ISMINKERNEL = ../arm/imin.c
|
||||||
|
IDMINKERNEL = ../arm/imin.c
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
SAMAXKERNEL = amax.S
|
||||||
|
DAMAXKERNEL = amax.S
|
||||||
|
CAMAXKERNEL = zamax.S
|
||||||
|
ZAMAXKERNEL = zamax.S
|
||||||
|
|
||||||
|
SAXPYKERNEL = axpy.S
|
||||||
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
|
CAXPYKERNEL = zaxpy.S
|
||||||
|
ZAXPYKERNEL = zaxpy.S
|
||||||
|
|
||||||
|
SROTKERNEL = rot.S
|
||||||
|
DROTKERNEL = rot.S
|
||||||
|
CROTKERNEL = zrot.S
|
||||||
|
ZROTKERNEL = zrot.S
|
||||||
|
|
||||||
|
SSCALKERNEL = scal.S
|
||||||
|
DSCALKERNEL = scal.S
|
||||||
|
CSCALKERNEL = zscal.S
|
||||||
|
ZSCALKERNEL = zscal.S
|
||||||
|
|
||||||
|
SGEMVNKERNEL = gemv_n.S
|
||||||
|
DGEMVNKERNEL = gemv_n.S
|
||||||
|
CGEMVNKERNEL = zgemv_n.S
|
||||||
|
ZGEMVNKERNEL = zgemv_n.S
|
||||||
|
|
||||||
|
SGEMVTKERNEL = gemv_t.S
|
||||||
|
DGEMVTKERNEL = gemv_t.S
|
||||||
|
CGEMVTKERNEL = zgemv_t.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t.S
|
||||||
|
|
||||||
|
|
||||||
|
SASUMKERNEL = sasum_thunderx2t99.c
|
||||||
|
DASUMKERNEL = dasum_thunderx2t99.c
|
||||||
|
CASUMKERNEL = casum_thunderx2t99.c
|
||||||
|
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||||
|
|
||||||
|
SCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
DCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
CCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
|
||||||
|
SSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
DSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
CSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
|
||||||
|
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
|
||||||
|
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
|
||||||
|
DDOTKERNEL = dot_thunderx2t99.c
|
||||||
|
SDOTKERNEL = dot_thunderx2t99.c
|
||||||
|
CDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
DSDOTKERNEL = dot.S
|
||||||
|
|
||||||
|
DGEMM_BETA = dgemm_beta.S
|
||||||
|
SGEMM_BETA = sgemm_beta.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||||
|
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||||
|
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||||
|
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||||
|
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
|
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||||
|
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||||
|
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||||
|
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
@ -0,0 +1,189 @@
|
||||||
|
SAMINKERNEL = ../arm/amin.c
|
||||||
|
DAMINKERNEL = ../arm/amin.c
|
||||||
|
CAMINKERNEL = ../arm/zamin.c
|
||||||
|
ZAMINKERNEL = ../arm/zamin.c
|
||||||
|
|
||||||
|
SMAXKERNEL = ../arm/max.c
|
||||||
|
DMAXKERNEL = ../arm/max.c
|
||||||
|
|
||||||
|
SMINKERNEL = ../arm/min.c
|
||||||
|
DMINKERNEL = ../arm/min.c
|
||||||
|
|
||||||
|
ISAMINKERNEL = ../arm/iamin.c
|
||||||
|
IDAMINKERNEL = ../arm/iamin.c
|
||||||
|
ICAMINKERNEL = ../arm/izamin.c
|
||||||
|
IZAMINKERNEL = ../arm/izamin.c
|
||||||
|
|
||||||
|
ISMAXKERNEL = ../arm/imax.c
|
||||||
|
IDMAXKERNEL = ../arm/imax.c
|
||||||
|
|
||||||
|
ISMINKERNEL = ../arm/imin.c
|
||||||
|
IDMINKERNEL = ../arm/imin.c
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
SAMAXKERNEL = amax.S
|
||||||
|
DAMAXKERNEL = amax.S
|
||||||
|
CAMAXKERNEL = zamax.S
|
||||||
|
ZAMAXKERNEL = zamax.S
|
||||||
|
|
||||||
|
SAXPYKERNEL = axpy.S
|
||||||
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
|
CAXPYKERNEL = zaxpy.S
|
||||||
|
ZAXPYKERNEL = zaxpy.S
|
||||||
|
|
||||||
|
SROTKERNEL = rot.S
|
||||||
|
DROTKERNEL = rot.S
|
||||||
|
CROTKERNEL = zrot.S
|
||||||
|
ZROTKERNEL = zrot.S
|
||||||
|
|
||||||
|
SSCALKERNEL = scal.S
|
||||||
|
DSCALKERNEL = scal.S
|
||||||
|
CSCALKERNEL = zscal.S
|
||||||
|
ZSCALKERNEL = zscal.S
|
||||||
|
|
||||||
|
SGEMVNKERNEL = gemv_n.S
|
||||||
|
DGEMVNKERNEL = gemv_n.S
|
||||||
|
CGEMVNKERNEL = zgemv_n.S
|
||||||
|
ZGEMVNKERNEL = zgemv_n.S
|
||||||
|
|
||||||
|
SGEMVTKERNEL = gemv_t.S
|
||||||
|
DGEMVTKERNEL = gemv_t.S
|
||||||
|
CGEMVTKERNEL = zgemv_t.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t.S
|
||||||
|
|
||||||
|
|
||||||
|
SASUMKERNEL = sasum_thunderx2t99.c
|
||||||
|
DASUMKERNEL = dasum_thunderx2t99.c
|
||||||
|
CASUMKERNEL = casum_thunderx2t99.c
|
||||||
|
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||||
|
|
||||||
|
SCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
DCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
CCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
|
||||||
|
SSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
DSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
CSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
|
||||||
|
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
|
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
|
||||||
|
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
|
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
|
||||||
|
DDOTKERNEL = dot_thunderx2t99.c
|
||||||
|
SDOTKERNEL = dot_thunderx2t99.c
|
||||||
|
CDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
DSDOTKERNEL = dot.S
|
||||||
|
|
||||||
|
DGEMM_BETA = dgemm_beta.S
|
||||||
|
SGEMM_BETA = sgemm_beta.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||||
|
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||||
|
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||||
|
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||||
|
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||||
|
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
|
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||||
|
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||||
|
else
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||||
|
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||||
|
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||||
|
else
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||||
|
endif
|
||||||
|
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||||
|
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
endif
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
58
param.h
58
param.h
|
@ -3307,6 +3307,64 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
||||||
#define CGEMM_DEFAULT_R 4096
|
#define CGEMM_DEFAULT_R 4096
|
||||||
#define ZGEMM_DEFAULT_R 4096
|
#define ZGEMM_DEFAULT_R 4096
|
||||||
|
|
||||||
|
#elif defined(NEOVERSEV1)
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_P 128
|
||||||
|
#define DGEMM_DEFAULT_P 160
|
||||||
|
#define CGEMM_DEFAULT_P 128
|
||||||
|
#define ZGEMM_DEFAULT_P 128
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_Q 352
|
||||||
|
#define DGEMM_DEFAULT_Q 128
|
||||||
|
#define CGEMM_DEFAULT_Q 224
|
||||||
|
#define ZGEMM_DEFAULT_Q 112
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_R 4096
|
||||||
|
#define DGEMM_DEFAULT_R 4096
|
||||||
|
#define CGEMM_DEFAULT_R 4096
|
||||||
|
#define ZGEMM_DEFAULT_R 4096
|
||||||
|
|
||||||
|
#elif defined(NEOVERSEN2)
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_P 128
|
||||||
|
#define DGEMM_DEFAULT_P 160
|
||||||
|
#define CGEMM_DEFAULT_P 128
|
||||||
|
#define ZGEMM_DEFAULT_P 128
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_Q 352
|
||||||
|
#define DGEMM_DEFAULT_Q 128
|
||||||
|
#define CGEMM_DEFAULT_Q 224
|
||||||
|
#define ZGEMM_DEFAULT_Q 112
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_R 4096
|
||||||
|
#define DGEMM_DEFAULT_R 4096
|
||||||
|
#define CGEMM_DEFAULT_R 4096
|
||||||
|
#define ZGEMM_DEFAULT_R 4096
|
||||||
|
|
||||||
#elif defined(ARMV8SVE) || defined(A64FX)
|
#elif defined(ARMV8SVE) || defined(A64FX)
|
||||||
|
|
||||||
/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
|
/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
|
||||||
|
|
Loading…
Reference in New Issue