diff --git a/Makefile.system b/Makefile.system index 7d26eccc3..62926b380 100644 --- a/Makefile.system +++ b/Makefile.system @@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += ARMV8SVE endif DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e6e434a0a..07df31b89 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index ef2597234..530d18115 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; @@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 16 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -173,6 +181,7 @@ static char *corename[] = { "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -192,6 +201,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; return corename[NUM_CORETYPES]; } @@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 12: return (&gotoblas_NEOVERSEN2); case 13: return (&gotoblas_THUNDERX3T110); case 14: return (&gotoblas_CORTEXA55); + case 15: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -345,6 +356,12 @@ static gotoblas_t 
*get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/param.h b/param.h index aa193a284..84e0c2ac7 100644 --- a/param.h +++ b/param.h @@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEV1) +#elif defined(NEOVERSEV1) // 256-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) +#elif defined(A64FX) // 512-bit SVE /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. 
*/ @@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept separate to keep copies separate +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16