diff --git a/Makefile.system b/Makefile.system index 847bab179..6c1f01ba7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -689,6 +689,7 @@ ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE +DYNAMIC_CORE += A64FX endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 diff --git a/cmake/arch.cmake b/cmake/arch.cmake index eb974456b..27c5650ab 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index e64352f4a..609fbe241 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1218,6 +1218,37 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "A64FX") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t256\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t256\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t8388608\n\n" + "#define L2_LINESIZE\t256\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t0\n\n" + "#define L3_LINESIZE\t0\n\n" + "#define L3_ASSOCIATIVE\t0\n\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) 
elseif ("${TCORE}" STREQUAL "P5600") file(APPEND ${TARGET_CONF_TEMP} "#define L2_SIZE 1048576\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index e4778249f..b682c3af8 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,18 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL A64FX) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") + else () + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve -mtune=a64fx") + else () + message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support A64FX.") + endif() + endif() + endif() endif() diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 6b21028d1..dc88d816f 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -120,6 +120,11 @@ extern gotoblas_t gotoblas_CORTEXA55; #else #define gotoblas_CORTEXA55 gotoblas_ARMV8 #endif +#ifdef DYN_A64FX +extern gotoblas_t gotoblas_A64FX; +#else +#define gotoblas_A64FX gotoblas_ARMV8 +#endif #else extern gotoblas_t gotoblas_CORTEXA53; #define gotoblas_CORTEXA55 gotoblas_CORTEXA53 @@ -136,10 +141,12 @@ extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_ARMV8SVE; +extern gotoblas_t gotoblas_A64FX; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_ARMV8SVE gotoblas_ARMV8 +#define gotoblas_A64FX gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; #endif @@ -149,7 +156,7 @@ extern void openblas_warning(int verbose, const char * msg); #define 
FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 17 +#define NUM_CORETYPES 18 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -184,6 +191,7 @@ static char *corename[] = { "thunderx3t110", "cortexa55", "armv8sve", + "a64fx", "unknown" }; @@ -205,6 +213,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; + if (gotoblas == &gotoblas_A64FX) return corename[17]; return corename[NUM_CORETYPES]; } @@ -241,6 +250,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 14: return (&gotoblas_THUNDERX3T110); case 15: return (&gotoblas_CORTEXA55); case 16: return (&gotoblas_ARMV8SVE); + case 17: return (&gotoblas_A64FX); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -346,6 +356,15 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX3T110; } break; + case 0x46: // Fujitsu + switch (part) + { +#ifndef NO_SVE + case 0x001: // A64FX + return &gotoblas_A64FX; +#endif + } + break; case 0x48: // HiSilicon switch (part) {